Skip to content

Commit

Permalink
fix: RSEM-calculate-expression wrapper with FASTQ-files as input + In…
Browse files Browse the repository at this point in the history
…tegration of test case (#970)

<!-- Ensure that the PR title follows conventional commit style (<type>:
<description>)-->
<!-- Possible types are here:
https://github.com/commitizen/conventional-commit-types/blob/master/index.json
-->

### Description

- Replaced deprecated argument for alignment inputs with appropriate
argument (`--bam`-> `--alignments`)
- Fixed wrong argument for FASTQ-input: Deleted `input_bam = False`
- Integrated test-case for FASTQ-input 
- Fixed environment-yaml-file: bowtie is needed for FASTQ-file input
- updated meta.yaml

### Notes
The original provided test-case from the authors for BAM-file inputs
includes only an empty BAM-file (except of the header). My test case
also includes reads, which do not map to the reference sequence (since
the provided reference sequence is too short anyways). So as I
understand, the purpose of these tests is just to ensure, whether the
pipeline is able to run through. However, to make it more robust
additional tests with real working examples would be needed.

Co-authored-by: Küchler, Oliver <oliver.kuechler@bih-charite.de>
Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
  • Loading branch information
3 people committed Jan 20, 2023
1 parent 26271ac commit ea9c897
Show file tree
Hide file tree
Showing 14 changed files with 50 additions and 6 deletions.
1 change: 1 addition & 0 deletions bio/rsem/calculate-expression/environment.yaml
Expand Up @@ -4,3 +4,4 @@ channels:
- nodefaults
dependencies:
- rsem =1.3.3
- bowtie =1.3.1
2 changes: 2 additions & 0 deletions bio/rsem/calculate-expression/meta.yaml
Expand Up @@ -8,6 +8,8 @@ input:
- bam: BAM file with reads aligned to transcriptome
- fq_one: FASTQ file of reads (read_1 for paired-end sequencing)
- fq_two: Optional second FASTQ file of reads (read_2 for paired-end sequencing)
- reference: Index files created by rsem-prepare-reference
- reference_bowtie: Additionally needed for FASTQ input; Index files created (by bowtie-build) from the reference transcriptome
output:
- genes_results: This file contains per-gene quantification data for the sample
- isoforms_results: This file contains per-transcript quantification data for the sample
Expand Down
6 changes: 4 additions & 2 deletions bio/rsem/calculate-expression/test/Snakefile
Expand Up @@ -3,8 +3,10 @@ rule calculate_expression:
# input.bam or input.fq_one must be specified (and if input.fq_one, optionally input.fq_two if paired-end)
# an aligned to transcriptome BAM
bam="mapped/a.bam",
# one of the index files created by rsem-prepare-reference; the file suffix is stripped and passed on to rsem
reference="index/reference.seq",
# Index files created by rsem-prepare-reference
reference=multiext("index/reference", ".grp", ".ti", ".transcripts.fa", ".seq", ".idx.fa", ".n2g.idx.fa"),
# reference_bowtie: Additionally needed for FASTQ input; Index files created (by bowtie-build) from the reference transcriptome
# reference_bowtie=multiext("index/reference", ".1.ebwt", ".2.ebwt", ".3.ebwt", ".4.ebwt", ".rev.1.ebwt", ".rev.2.ebwt"),
output:
# genes_results must end in .genes.results; this suffix is stripped and passed to rsem as an output name prefix
# this file contains per-gene quantification data for the sample
Expand Down
26 changes: 26 additions & 0 deletions bio/rsem/calculate-expression/test/Snakefile_fastq
@@ -0,0 +1,26 @@
rule calculate_expression:
input:
# input.bam or input.fq_one must be specified (and if input.fq_one, optionally input.fq_two if paired-end)
fq_one = "fastq/a_R1.fastq",
fq_two = "fastq/a_R2.fastq",
# Index files created by rsem-prepare-reference
reference=multiext("index/reference", ".grp", ".ti", ".transcripts.fa", ".seq", ".idx.fa", ".n2g.idx.fa"),
# reference_bowtie: Additionally needed for FASTQ input; Index files created (by bowtie-build) from the reference transcriptome
reference_bowtie=multiext("index/reference", ".1.ebwt", ".2.ebwt", ".3.ebwt", ".4.ebwt", ".rev.1.ebwt", ".rev.2.ebwt"),
output:
# genes_results must end in .genes.results; this suffix is stripped and passed to rsem as an output name prefix
# this file contains per-gene quantification data for the sample
genes_results="output/a.genes.results",
# isoforms_results must end in .isoforms.results and otherwise have the same prefix as genes_results
# this file contains per-transcript quantification data for the sample
isoforms_results="output/a.isoforms.results",
params:
# optional, specify if sequencing is paired-end
paired_end=True,
# additional optional parameters to pass to rsem, for example,
extra="--seed 42",
log:
"logs/rsem/calculate_expression/a.log",
threads: 2
wrapper:
"master/bio/rsem/calculate-expression"
4 changes: 4 additions & 0 deletions bio/rsem/calculate-expression/test/fastq/a_R1.fastq
@@ -0,0 +1,4 @@
@test:sequence:a/1
AATAAAGTAAAAGCATGTTCCAGTTTAAGATCAATGCCAAATTTCTGAACTGGGGCATAATACTTACTCTGCATCTTTTACAACACTTCTAAGAGTTAATCACATAGATCGGAAGAGCACACGTC
+
/<BBBF<FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
4 changes: 4 additions & 0 deletions bio/rsem/calculate-expression/test/fastq/a_R2.fastq
@@ -0,0 +1,4 @@
@test:sequence:a/2
TTATTTCATTTTCGTACAAGGTCAAATTCTAGTTACGGTTTAAAGACTTGACCCCGTATTATGAATGAGACGTAGAAAATGTTGTGAAGATTCTCAATTAGTGTATCTAGCCTTCTCGTGTGCAG
+
BBBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
5 changes: 2 additions & 3 deletions bio/rsem/calculate-expression/wrapper.py
Expand Up @@ -14,13 +14,12 @@
if bam:
if fq_one:
raise Exception("Only input.bam or input.fq_one expected, got both.")
input_bam = " --bam"
input_bam = "--alignments"
input_string = bam
paired_end = snakemake.params.get("paired-end", False)
else:
input_bam = ""
if fq_one:
input_bam = False
if isinstance(fq_one, list):
num_fq_one = len(fq_one)
input_string = ",".join(fq_one)
Expand Down Expand Up @@ -62,7 +61,7 @@
"(rsem will append .isoforms.results suffix)"
)

reference_prefix = os.path.splitext(snakemake.input.reference)[0]
reference_prefix = os.path.splitext(snakemake.input.reference[0])[0]

extra = snakemake.params.get("extra", "")
threads = snakemake.threads
Expand Down
8 changes: 7 additions & 1 deletion test.py
Expand Up @@ -4932,11 +4932,17 @@ def test_collectgcbiasmetrics():


@skip_if_not_modified
def test_calculate_expression():
def test_rsem_calculate_expression():
run(
"bio/rsem/calculate-expression",
["snakemake", "--cores", "1", "--use-conda", "-F"],
)

run(
"bio/rsem/calculate-expression",
["snakemake", "--cores", "1", "--use-conda", "-F",
"-s", "Snakefile_fastq"],
)


@skip_if_not_modified
Expand Down

0 comments on commit ea9c897

Please sign in to comment.