/
nextflow_schema.json
280 lines (280 loc) · 16.7 KB
/
nextflow_schema.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
"title": "epi2me-labs/wf-clone-validation",
"description": "De-novo reconstruction of synthetic plasmid sequences.",
"demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-clone-validation/wf-clone-validation-demo.tar.gz",
"aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-clone-validation/wf-clone-validation-demo/aws.nextflow.config",
"url": "https://github.com/epi2me-labs/wf-clone-validation",
"type": "object",
"definitions": {
"input_options": {
"title": "Input Options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters for finding and handling input data for analysis.",
"properties": {
"fastq": {
"type": "string",
"title": "FASTQ",
"format": "path",
"description": "Directory containing fastq input files. May contain fastq files directly or directories named barcodeXX relating to independent samples.",
"fa_icon": "fab fa-adobe",
"help_text": "If directories named \"barcode*\" are found under the `--fastq` directory the data is assumed to be multiplexed and each barcode directory will be processed independently. If `.fastq(.gz)` files are found under the `--fastq` directory the sample is assumed to not be multiplexed. In this second case `--sample` should be a simple name rather than a CSV file."
},
"approx_size": {
"type": "integer",
"title": "Approximate plasmid size",
"default": 7000,
"description": "Approximate size of the plasmid in base pairs. This can also be defined per sample; see the `sample_sheet` parameter.",
"minimum": 1
},
"assm_coverage": {
"type": "integer",
"title": "Assembly Coverage",
"default": 60,
"description": "Fold coverage for use per assembly",
"minimum": 1,
"help_text": "This is the coverage that will be used to subsample reads to use for the assembly."
},
"primers": {
"type": "string",
"format": "file-path",
"description": "TSV File containing primers used to find inserts. If left empty then inserts will not be searched for.",
"help_text": "Specify one or more primer sets which will be used to find the sequence inserted in the construct. Should be in .tsv format [primer_name, 5', 3'] with no header. An example primers.tsv for pRham/T7 is available in the data folder of the workflow."
},
"analyse_unclassified": {
"type": "boolean",
"title": "Analyse unclassified reads",
"default": false,
"description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.",
"help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory."
},
"basecaller_cfg": {
"type": "string",
"title": "Basecaller model configuration",
"description": "Name of the model that was used to basecall signal data, used to select an appropriate Medaka model.",
"help_text": "The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_model' parameter. The model list only shows models that are compatible with this workflow.",
"default": "dna_r10.4.1_e8.2_400bps_sup@v4.2.0",
"enum": [
"dna_r10.4.1_e8.2_400bps_sup@v4.2.0",
"dna_r10.4.1_e8.2_400bps_hac@v4.2.0",
"dna_r10.4.1_e8.2_260bps_hac@v4.1.0",
"dna_r10.4.1_e8.2_260bps_sup@v4.1.0",
"dna_r10.4.1_e8.2_400bps_hac@v4.1.0",
"dna_r10.4.1_e8.2_400bps_sup@v4.1.0",
"dna_r10.4.1_e8.2_260bps_hac@v4.0.0",
"dna_r10.4.1_e8.2_260bps_sup@v4.0.0",
"dna_r10.4.1_e8.2_400bps_hac@v4.0.0",
"dna_r10.4.1_e8.2_400bps_sup@v4.0.0",
"dna_r10.4.1_e8.2_400bps_hac@v3.5.2",
"dna_r10.4.1_e8.2_400bps_sup@v3.5.2",
"dna_r9.4.1_e8_fast@v3.4",
"dna_r9.4.1_e8_hac@v3.3",
"dna_r9.4.1_e8_sup@v3.3",
"dna_r10.4.1_e8.2_400bps_hac_prom",
"dna_r9.4.1_450bps_hac_prom",
"dna_r10.3_450bps_hac",
"dna_r10.3_450bps_hac_prom",
"dna_r10.4.1_e8.2_260bps_hac",
"dna_r10.4.1_e8.2_260bps_hac_prom",
"dna_r10.4.1_e8.2_400bps_hac",
"dna_r9.4.1_450bps_hac",
"dna_r9.4.1_e8.1_hac",
"dna_r9.4.1_e8.1_hac_prom"
]
}
},
"allOf": [
{
"required": [
"fastq"
]
},
{
"oneOf": [
{
"required": [
"basecaller_cfg"
]
},
{
"required": [
"medaka_model"
]
}
]
}
]
},
"reference_genome_options": {
"title": "Reference Genome Options",
"type": "object",
"fa_icon": "fas fa-dna",
"description": "Reference genome related files and options required for the workflow.",
"properties": {
"insert_reference": {
"type": "string",
"format": "file-path",
"description": "Optional file containing insert reference sequence which will be used for comparison with consensus insert in the report.",
"help_text": "Providing a reference sequence can be useful as a QC on the base-level resolution of the reconstructed insert sequences."
},
"host_reference": {
"type": "string",
"format": "file-path",
"description": "FASTA file, reads which map to it are discarded."
},
"regions_bedfile": {
"type": "string",
"title": "Regions BED file",
"format": "file-path",
"description": "If a host_reference is supplied, add an optional BED file to provide host reference regions that will be masked during filtering."
}
}
},
"sample_options": {
"title": "Sample Options",
"type": "object",
"description": "Parameters that relate to samples such as sample sheets and sample names.",
"default": "",
"properties": {
"sample_sheet": {
"type": "string",
"format": "file-path",
"title": "Sample sheet",
"description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. An optional column `approx_size` can be added to provide size estimates for each sample. When not provided, the `--approx_size` parameter will be used for all samples.",
"help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have one of the following values: `test_sample`, `positive_control`, `negative_control`, `no_template_control`."
},
"sample": {
"type": "string",
"description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files."
},
"min_barcode": {
"type": "integer",
"default": 0,
"title": "Minimum barcode",
"description": "Minimum numeric sample barcode ID to process.",
"help_text": "If multiplexed data is provided, this filter can be used to exclude certain samples from processing."
},
"max_barcode": {
"type": "integer",
"default": 192,
"title": "Maximum barcode",
"description": "Maximum (inclusive) numeric sample barcode ID to process.",
"help_text": "If multiplexed data is provided, this filter can be used to exclude certain samples from processing."
}
}
},
"output_options": {
"title": "Output Options",
"type": "object",
"description": "Parameters for saving and naming workflow outputs.",
"default": "",
"properties": {
"out_dir": {
"type": "string",
"default": "output",
"format": "directory-path",
"description": "Directory for output of all user-facing files."
},
"prefix": {
"type": "string",
"description": "The prefix attached to each of the output filenames."
}
}
},
"advanced_options": {
"title": "Advanced Options",
"type": "object",
"description": "Advanced options for configuring processes inside the workflow.",
"default": "",
"properties": {
"trim_length": {
"type": "integer",
"default": 150,
"description": "Number of base pairs to trim from both ends of each read."
},
"medaka_model": {
"type": "string",
"description": "The name of a Medaka model to use. By default the workflow will select an appropriate Medaka model from the basecaller configuration provided. Entering a name here will override the automated selection and use the Medaka model named here.",
"help_text": "The workflow will attempt to map the basecalling model used to a suitable Medaka model. You can override this by providing a model with this option instead."
},
"flye_quality": {
"type": "string",
"default": "nano-hq",
"description": "The Flye parameter for quality of input reads, default `nano-hq`: high-quality reads, Guppy5+ SUP or Q20 (<5% error).",
"help_text": "Other options include `nano-corr`: reads that were corrected with other methods (<3% error), `nano-raw`: pre-Guppy5 (<20% error).",
"enum": [
"nano-hq",
"nano-corr",
"nano-raw"
]
},
"db_directory": {
"type": "string",
"title": "Database directory",
"format": "directory-path",
"description": "Optional directory containing a gene annotation database.",
"help_text": "A default generic annotation is provided in tar.gz format, containing entries from [fpbase](https://www.fpbase.org/), [Swiss-Prot](https://www.expasy.org/resources/uniprotkb-swiss-prot), [Rfam](https://rfam.org/) and [snapgene](https://www.snapgene.com/)."
}
}
},
"miscellaneous_options": {
"title": "Miscellaneous Options",
"type": "object",
"description": "Everything else.",
"default": "",
"properties": {
"threads": {
"type": "integer",
"default": 4,
"description": "Maximum number of CPU threads to use per workflow task.",
"help_text": "Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. The total CPU resource used by the workflow is constrained by the executor configuration."
},
"help": {
"type": "boolean",
"description": "Display help text.",
"fa_icon": "fas fa-question-circle",
"hidden": true,
"default": false
},
"version": {
"type": "boolean",
"default": false,
"description": "Display version and exit.",
"fa_icon": "fas fa-question-circle",
"hidden": true
},
"disable_ping": {
"type": "boolean",
"default": false,
"description": "Enable to prevent sending a workflow ping."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/input_options"
},
{
"$ref": "#/definitions/reference_genome_options"
},
{
"$ref": "#/definitions/sample_options"
},
{
"$ref": "#/definitions/output_options"
},
{
"$ref": "#/definitions/advanced_options"
},
{
"$ref": "#/definitions/miscellaneous_options"
}
],
"docs": {
"intro": "## Introduction\n\nThe workflow accepts FASTQ as the primary input.\n\nOptional inputs:\n\n* A `host_reference` FASTA file.\n \n* A primers TSV file if they differ from the default pRham/T7.\n\nThe steps of the workflow are as follows - \n\n* If a `host_reference` is provided, [minimap2](https://github.com/lh3/minimap2) is used to align the reads to the `host_reference`, aligned reads are filtered out.\n\n* The reads are then trimmed at the ends using [Seqkit](https://bioinf.shenwei.me/seqkit/) with the provided `--trim_length` parameter. \n\n* The sequences are then downsampled using the tool [Rasusa](https://github.com/mbhall88/rasusa).\n\n* [Trycycler](https://github.com/rrwick/Trycycler) is used to create 3 subsamples which are each assembled using [Flye](https://github.com/fenderglass/Flye).\n\n* If there are concatemers in the assembly these are found using minimap2 and deconcatenated using a Python script. \n\n* Trycycler is used to reconcile the subsampled assemblies into one final assembly. This is polished with [Medaka](https://github.com/nanoporetech/medaka).\n\n* [Seqkit](https://bioinf.shenwei.me/seqkit/) is used to find inserts using the primers supplied to the `--primers` parameter. \n\n* The assembly is annotated using [pLannotate](https://github.com/barricklab/pLannotate) with the default database containing entries from [fpbase](https://www.fpbase.org/), [Swiss-Prot](https://www.expasy.org/resources/uniprotkb-swiss-prot), [Rfam](https://rfam.org/) and [snapgene](https://www.snapgene.com/). \n\n* A quality score for the assembly is provided by Medaka.\n\n* Optionally a reference insert sequence can be provided with --insert_reference, which is aligned to the consensus and any variants are reported by [bcftools](https://samtools.github.io/bcftools/bcftools.html).",
"links": "## Useful links\n\n* [nextflow](https://www.nextflow.io/)\n* [docker](https://www.docker.com/products/docker-desktop)\n* [singularity](https://docs.sylabs.io/)\n* [minimap2](https://github.com/lh3/minimap2)\n* [Seqkit](https://bioinf.shenwei.me/seqkit/)\n* [Rasusa](https://github.com/mbhall88/rasusa)\n* [Trycycler](https://github.com/rrwick/Trycycler)\n* [Flye](https://github.com/fenderglass/Flye)\n* [Medaka](https://github.com/nanoporetech/medaka)\n* [plannotate tool](https://github.com/barricklab/pLannotate)\n* [bcftools](https://samtools.github.io/bcftools/bcftools.html)"
}
}