From b653a44d105e4b3799425a695d75a08239dc0d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Mon, 29 Nov 2021 12:30:11 +0100 Subject: [PATCH] docs: short tutorial updates (#1286) * first round of short tutorial updates * update * final updates --- docs/tutorial/short.rst | 352 ++++++++++++++++++++-------------------- 1 file changed, 178 insertions(+), 174 deletions(-) diff --git a/docs/tutorial/short.rst b/docs/tutorial/short.rst index 89b9a3c59..79d26b7ff 100644 --- a/docs/tutorial/short.rst +++ b/docs/tutorial/short.rst @@ -39,7 +39,8 @@ First, create an empty workflow in the current directory with: :: - touch Snakefile + mkdir workflow + touch workflow/Snakefile Once a Snakefile is present, you can perform a dry run of Snakemake with: @@ -77,14 +78,14 @@ You will create a workflow that maps the sequencing samples in the Then, you will call genomic variants over the mapped samples, and create an example plot. -First, create a rule called ``bwa``, with input files +First, create a rule called ``map_reads``, with input files - ``data/genome.fa`` - ``data/samples/A.fastq`` and output file -- ``mapped/A.bam`` +- ``results/mapped/A.bam`` To generate output from input, use the shell command @@ -114,41 +115,41 @@ Upon execution, Snakemake will automatically create that environment, and execute the shell command within. Now, test your workflow by simulating the creation of the file -``mapped/A.bam`` via +``results/mapped/A.bam`` via :: - snakemake --use-conda -n mapped/A.bam + snakemake --use-conda -n results/mapped/A.bam to perform a dry-run and :: - snakemake --use-conda mapped/A.bam --cores 1 + snakemake --use-conda results/mapped/A.bam --cores 1 to perform the actual execution. Step 3 ------ -Now, generalize the rule ``bwa`` by replacing the concrete sample name +Now, generalize the rule ``map_reads`` by replacing the concrete sample name ``A`` with a wildcard ``{sample}`` in input and output file the rule -``bwa``. This way, Snakemake can apply the rule to map any of the three +``map_reads``. This way, Snakemake can apply the rule to map any of the three available samples to the reference genome. -Test this by creating the file ``mapped/B.bam``. +Test this by creating the file ``results/mapped/B.bam``. Step 4 ------ -Next, create a rule ``sort`` that sorts the obtained ``.bam`` file by +Next, create a rule ``sort_alignments`` that sorts the obtained ``.bam`` file by genomic coordinate. The rule should have the input file -- ``mapped/{sample}.bam`` +- ``results/mapped/{sample}.bam`` and the output file -- ``mapped/{sample}.sorted.bam`` +- ``results/mapped/{sample}.sorted.bam`` and uses the shell command @@ -163,13 +164,13 @@ Test your workflow with :: - snakemake --use-conda -n mapped/A.sorted.bam + snakemake --use-conda -n results/mapped/A.sorted.bam and :: - snakemake --use-conda mapped/A.sorted.bam --cores 1 + snakemake --use-conda results/mapped/A.sorted.bam --cores 1 Step 5 ------ @@ -193,17 +194,17 @@ docs {output} + bcftools mpileup -f {input.fa} {input.bam} | bcftools call -mv - > {output} Further, define a new conda environment file with the following content: @@ -214,76 +215,84 @@ Further, define a new conda environment file with the following content: - conda-forge dependencies: - bcftools =1.9 - - samtools =1.9 Step 6 ------ Finally, we strive to calculate some exemplary statistics. This time, we don’t use a shell command, but rather employ Snakemake’s ability to -integrate with scripting languages like R and Python. +integrate with scripting languages like R and Python, and Jupyter notebooks. -First, we create a rule ``stats`` with input file +First, we create a rule ``plot_quals`` with input file -- ``"calls/all.vcf"`` +- ``"results/calls/all.vcf"`` and output file -- ``"plots/quals.svg"``. +- ``"results/plots/quals.svg"``. -Instead of a shell command, we write +Instead of a shell command, we use Snakemake's Jupyter notebook integration by specifying .. code:: python - script: - "scripts/plot-quals.py" + notebook: + "notebooks/plot-quals.py" -and create the corresponding script and its containing folder in our -working directory with +instead of using the ``shell`` directive as before. -:: +Next, we have to define a conda environment for the rule, say +``workflow/envs/stats.yaml``, that provides the required Python packages to +execute the script: + +.. code:: yaml + + channels: + - bioconda + - conda-forge + dependencies: + - pysam =0.17 + - altair =4.1 + - altair_saver =0.5 + - pandas =1.3 + - jupyter =1.0 - mkdir scripts - touch scripts/plot-quals.py +Then, we let Snakemake generate a skeleton notebook for us with -We open the script in the editor and add the following content +.. code:: console + + snakemake --draft-notebook results/plots/quals.svg --cores 1 --use-conda + +Snakemake will print instructions on how to open, edit and execute the notebook. + +We open the notebook in the editor and add the following content .. code:: python - import matplotlib - matplotlib.use("Agg") - import matplotlib.pyplot as plt - from pysam import VariantFile - - quals = [record.qual for record in VariantFile(snakemake.input[0])] - plt.hist(quals) - - plt.savefig(snakemake.output[0]) + import pandas as pd + import altair as alt + from pysam import VariantFile -As you can see, instead of writing a command line parser for passing -parameters like input and output files, you have direct access to the -properties of the rule via a magic ``snakemake`` object, that Snakemake -automatically inserts into the script before executing the rule. + quals = pd.DataFrame({"qual": [record.qual for record in VariantFile(snakemake.input[0])]}) -Finally, we have to define a conda environment for the rule, say -``envs/stats.yaml``, that provides the required Python packages to -execute the script: + chart = alt.Chart(quals).mark_bar().encode( + alt.X("qual", bin=True), + alt.Y("count()") + ) -.. code:: yaml + chart.save(snakemake.output[0]) - channels: - - bioconda - - conda-forge - dependencies: - - pysam =0.15 - - matplotlib =3.1 - - python =3.8 +As you can see, instead of writing a command line parser for passing +parameters like input and output files, you have direct access to the +properties of the rule via a magic ``snakemake`` object, that Snakemake +automatically inserts into the notebook before executing the rule. Make sure to test your workflow with :: - snakemake --use-conda plots/quals.svg --cores 1 + snakemake --use-conda --force results/plots/quals.svg --cores 1 + +Here, the force ensures that the readily drafted notebook is re-executed even if you had already generated the output plot in the interactive mode. Step 7 ------ @@ -295,8 +304,8 @@ define default target files. At the top of your ``Snakefile`` define a rule ``all``, with input files -- ``"calls/all.vcf"`` -- ``"plots/quals.svg"`` +- ``"results/calls/all.vcf"`` +- ``"results/plots/quals.svg"`` and neither a shell command nor output files. This rule simply serves as an indicator of what shall be collected as results. @@ -325,8 +334,8 @@ such that it can be, e.g., emailed as a self-contained document. The reader (e.g., a collaborator of yours) can at any time download the enclosed results from the report for further use, e.g., in a manuscript you write together. In this example, please mark the output file -``"plots/quals.svg"`` for inclusion by replacing it with -``report("plots/quals.svg", caption="report/calling.rst")`` and adding a +``"results/plots/quals.svg"`` for inclusion by replacing it with +``report("results/plots/quals.svg", caption="report/calling.rst")`` and adding a file ``report/calling.rst``, containing some description of the output file. This description will be presented as caption in the resulting report. @@ -334,7 +343,7 @@ report. Threads ~~~~~~~ -The first rule ``bwa`` can in theory use multiple threads. You can make +The first rule ``map_reads`` can in theory use multiple threads. You can make Snakemake aware of this, such that the information can be used for scheduling. Add a directive ``threads: 8`` to the rule and alter the shell command to @@ -349,12 +358,12 @@ to the ``bwa`` process. Temporary files ~~~~~~~~~~~~~~~ -The output of the ``bwa`` rule becomes superfluous once the sorted +The output of the ``map_reads`` rule becomes superfluous once the sorted version of the ``.bam`` file is generated by the rule ``sort``. Snakemake can automatically delete the superfluous output once it is not needed anymore. For this, mark the output as temporary by replacing -``"mapped/{sample}.bam"`` in the rule ``bwa`` with -``temp("mapped/{sample}.bam")``. +``"results/mapped/{sample}.bam"`` in the rule ``bwa`` with +``temp("results/mapped/{sample}.bam")``. Solutions --------- @@ -370,16 +379,16 @@ The rule should look like this: .. code:: python - rule bwa: - input: - "data/genome.fa", - "data/samples/A.fastq" - output: - "mapped/A.bam" - conda: - "envs/mapping.yaml" - shell: - "bwa mem {input} | samtools view -Sb - > {output}" + rule map_reads: + input: + "data/genome.fa", + "data/samples/A.fastq" + output: + "results/mapped/A.bam" + conda: + "envs/mapping.yaml" + shell: + "bwa mem {input} | samtools view -b - > {output}" .. _step-3-1: @@ -390,16 +399,16 @@ The rule should look like this: .. code:: python - rule bwa: - input: - "data/genome.fa", - "data/samples/{sample}.fastq" - output: - "mapped/{sample}.bam" - conda: - "envs/mapping.yaml" - shell: - "bwa mem {input} | samtools view -Sb - > {output}" + rule map_reads: + input: + "data/genome.fa", + "data/samples/{sample}.fastq" + output: + "results/mapped/{sample}.bam" + conda: + "envs/mapping.yaml" + shell: + "bwa mem {input} | samtools view -b - > {output}" .. _step-4-1: @@ -410,15 +419,15 @@ The rule should look like this: .. code:: python - rule sort: - input: - "mapped/{sample}.bam" - output: - "mapped/{sample}.sorted.bam" - conda: - "envs/mapping.yaml" - shell: - "samtools sort -o {output} {input}" + rule sort_alignments: + input: + "results/mapped/{sample}.bam" + output: + "results/mapped/{sample}.sorted.bam" + conda: + "envs/mapping.yaml" + shell: + "samtools sort -o {output} {input}" .. _step-5-1: @@ -429,19 +438,18 @@ The rule should look like this: .. code:: python - samples = ["A", "B", "C"] - - rule call: - input: - fa="data/genome.fa", - bam=expand("mapped/{sample}.sorted.bam", sample=samples) - output: - "calls/all.vcf" - conda: - "envs/calling.yaml" - shell: - "samtools mpileup -g -f {input.fa} {input.bam} | " - "bcftools call -mv - > {output}" + samples = ["A", "B", "C"] + + rule call_variants: + input: + fa="data/genome.fa", + bam=expand("results/mapped/{sample}.sorted.bam", sample=SAMPLES) + output: + "results/calls/all.vcf" + conda: + "envs/calling.yaml" + shell: + "bcftools mpileup -f {input.fa} {input.bam} | bcftools call -mv - > {output}" .. _step-6-1: @@ -452,15 +460,15 @@ The rule should look like this: .. code:: python - rule stats: - input: - "calls/all.vcf" - output: - "plots/quals.svg" - conda: - "envs/stats.yaml" - script: - "scripts/plot-quals.py" + rule plot_quals: + input: + "results/calls/all.vcf" + output: + "results/plots/quals.svg" + conda: + "envs/stats.yaml" + notebook: + "notebooks/plot-quals.py.ipynb" .. _step-7-1: @@ -471,10 +479,10 @@ The rule should look like this: .. code:: python - rule all: - input: - "calls/all.vcf", - "plots/quals.svg" + rule all: + input: + "results/calls/all.vcf", + "results/plots/quals.svg" It has to appear as first rule in the ``Snakefile``. @@ -487,58 +495,54 @@ The complete workflow should look like this: .. code:: python - samples = ["A", "B"] - - - rule all: - input: - "calls/all.vcf", - "plots/quals.svg" - - - rule bwa: - input: - "data/genome.fa", - "data/samples/{sample}.fastq" - output: - temp("mapped/{sample}.bam") - conda: - "envs/mapping.yaml" - threads: 8 - shell: - "bwa mem -t {threads} {input} | samtools view -Sb - > {output}" - - - rule sort: - input: - "mapped/{sample}.bam" - output: - "mapped/{sample}.sorted.bam" - conda: - "envs/mapping.yaml" - shell: - "samtools sort -o {output} {input}" - - - - rule call: - input: - fa="data/genome.fa", - bam=expand("mapped/{sample}.sorted.bam", sample=samples) - output: - "calls/all.vcf" - conda: - "envs/calling.yaml" - shell: - "samtools mpileup -g -f {input.fa} {input.bam} | " - "bcftools call -mv - > {output}" - - rule stats: - input: - "calls/all.vcf" - output: - report("plots/quals.svg", caption="report/calling.rst") - conda: - "envs/stats.yaml" - script: - "scripts/plot-quals.py" + SAMPLES = ["A", "B", "C"] + + rule all: + input: + "results/calls/all.vcf", + "results/plots/quals.svg" + + rule map_reads: + input: + "data/genome.fa", + "data/samples/{sample}.fastq" + output: + "results/mapped/{sample}.bam" + conda: + "envs/mapping.yaml" + shell: + "bwa mem {input} | samtools view -b - > {output}" + + + rule sort_alignments: + input: + "results/mapped/{sample}.bam" + output: + "results/mapped/{sample}.sorted.bam" + conda: + "envs/mapping.yaml" + shell: + "samtools sort -o {output} {input}" + + + rule call_variants: + input: + fa="data/genome.fa", + bam=expand("results/mapped/{sample}.sorted.bam", sample=SAMPLES) + output: + "results/calls/all.vcf" + conda: + "envs/calling.yaml" + shell: + "bcftools mpileup -f {input.fa} {input.bam} | bcftools call -mv - > {output}" + + + rule plot_quals: + input: + "results/calls/all.vcf" + output: + "results/plots/quals.svg" + conda: + "envs/stats.yaml" + notebook: + "notebooks/plot-quals.py.ipynb"