nextflow_schema.json

{
    "$schema": "http://json-schema.org/draft-07/schema",
    "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
    "title": "epi2me-labs/wf-clone-validation",
    "description": "De-novo reconstruction of synthetic plasmid sequences.",
    "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-clone-validation/wf-clone-validation-demo.tar.gz",
    "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-clone-validation/wf-clone-validation-demo/aws.nextflow.config",
    "url": "https://github.com/epi2me-labs/wf-clone-validation",
    "type": "object",
    "definitions": {
        "input_options": {
            "title": "Input Options",
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Parameters for finding and handling input data for analysis.",
            "properties": {
                "fastq": {
                    "type": "string",
                    "title": "FASTQ",
                    "format": "path",
                    "description": "Directory containing fastq input files. May contain fastq files directly or directories named barcodeXX relating to independent samples.",
                    "fa_icon": "fab fa-adobe",
                    "help_text": "If directories named \"barcode*\" are found under the `--fastq` directory the data is assumed to be multiplexed and each barcode directory will be processed independently. If `.fastq(.gz)` files are found under the `--fastq` directory the sample is assumed to not be multiplexed. In this second case `--sample` should be used to give a simple sample name, rather than providing a CSV file via `--sample_sheet`."
                },
                "approx_size": {
                    "type": "integer",
                    "title": "Approximate plasmid size",
                    "default": 7000,
                    "description": "Approximate size of the plasmid in base pairs. This can also be defined per sample; see the `sample_sheet` parameter.",
                    "minimum": 1
                },
                "assm_coverage": {
                    "type": "integer",
                    "title": "Assembly Coverage",
                    "default": 60,
                    "description": "Fold coverage for use per assembly",
                    "minimum": 1,
                    "help_text": "This is the coverage that will be used to subsample reads to use for the assembly."
                },
                "primers": {
                    "type": "string",
                    "format": "file-path",
                    "description": "TSV File containing primers used to find inserts. If left empty then inserts will not be searched for.",
                    "help_text": "Specify one or more primer sets which will be used to find the sequence inserted in the construct. The file should be in .tsv format [primer_name, 5', 3'] with no header. An example primers.tsv for pRham/T7 is available in the data folder of the workflow."
                },
                "analyse_unclassified": {
                    "type": "boolean",
                    "title": "Analyse unclassified reads",
                    "default": false,
                    "description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.",
                    "help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory."
                },
                "basecaller_cfg": {
                    "type": "string",
                    "title": "Basecaller model configuration",
                    "description": "Name of the model that was used to basecall signal data, used to select an appropriate Medaka model.",
                    "help_text": "The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_model' parameter. The model list only shows models that are compatible with this workflow.",
                    "default": "dna_r10.4.1_e8.2_400bps_sup@v4.2.0",
                    "enum": [
                        "dna_r10.4.1_e8.2_400bps_sup@v4.2.0",
                        "dna_r10.4.1_e8.2_400bps_hac@v4.2.0",
                        "dna_r10.4.1_e8.2_260bps_hac@v4.1.0",
                        "dna_r10.4.1_e8.2_260bps_sup@v4.1.0",
                        "dna_r10.4.1_e8.2_400bps_hac@v4.1.0",
                        "dna_r10.4.1_e8.2_400bps_sup@v4.1.0",
                        "dna_r10.4.1_e8.2_260bps_hac@v4.0.0",
                        "dna_r10.4.1_e8.2_260bps_sup@v4.0.0",
                        "dna_r10.4.1_e8.2_400bps_hac@v4.0.0",
                        "dna_r10.4.1_e8.2_400bps_sup@v4.0.0",
                        "dna_r10.4.1_e8.2_400bps_hac@v3.5.2",
                        "dna_r10.4.1_e8.2_400bps_sup@v3.5.2",
                        "dna_r9.4.1_e8_fast@v3.4",
                        "dna_r9.4.1_e8_hac@v3.3",
                        "dna_r9.4.1_e8_sup@v3.3",
                        "dna_r10.4.1_e8.2_400bps_hac_prom",
                        "dna_r9.4.1_450bps_hac_prom",
                        "dna_r10.3_450bps_hac",
                        "dna_r10.3_450bps_hac_prom",
                        "dna_r10.4.1_e8.2_260bps_hac",
                        "dna_r10.4.1_e8.2_260bps_hac_prom",
                        "dna_r10.4.1_e8.2_400bps_hac",
                        "dna_r9.4.1_450bps_hac",
                        "dna_r9.4.1_e8.1_hac",
                        "dna_r9.4.1_e8.1_hac_prom"
                    ]
                }
            },
            "allOf": [
                {
                    "required": [
                        "fastq"
                    ]
                },
                {
                    "oneOf": [
                        {
                            "required": [
                                "basecaller_cfg"
                            ]
                        },
                        {
                            "required": [
                                "medaka_model"
                            ]
                        }
                    ]
                }
            ]
        },
        "reference_genome_options": {
            "title": "Reference Genome Options",
            "type": "object",
            "fa_icon": "fas fa-dna",
            "description": "Reference genome related files and options required for the workflow.",
            "properties": {
                "insert_reference": {
                    "type": "string",
                    "format": "file-path",
                    "description": "Optional file containing insert reference sequence which will be used for comparison with consensus insert in the report.",
                    "help_text": "Providing a reference sequence can be useful as a QC on the base-level resolution of the reconstructed insert sequences."
                },
                "host_reference": {
                    "type": "string",
                    "format": "file-path",
                    "description": "FASTA file, reads which map to it are discarded."
                },
                "regions_bedfile": {
                    "type": "string",
                    "title": "Regions BED file",
                    "format": "file-path",
                    "description": "If a host_reference is supplied, add an optional BED file to provide host reference regions that will be masked during filtering."
                }
            }
        },
        "sample_options": {
            "title": "Sample Options",
            "type": "object",
            "description": "Parameters that relate to samples such as sample sheets and sample names.",
            "default": "",
            "properties": {
                "sample_sheet": {
                    "type": "string",
                    "format": "file-path",
                    "title": "Sample sheet",
                    "description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. An optional column `approx_size` can be added to provide size estimates for each sample. When not provided, the `--approx_size` parameter will be used for all samples.",
                    "help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have one of the following values: `test_sample`, `positive_control`, `negative_control`, `no_template_control`."
                },
                "sample": {
                    "type": "string",
                    "description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files."
                },
                "min_barcode": {
                    "type": "integer",
                    "default": 0,
                    "title": "Minimum barcode",
                    "description": "Minimum numeric sample barcode ID to process.",
                    "help_text": "If multiplexed data is provided, this filter can be used to exclude certain samples from processing."
                },
                "max_barcode": {
                    "type": "integer",
                    "default": 192,
                    "title": "Maximum barcode",
                    "description": "Maximum (inclusive) numeric sample barcode ID to process.",
                    "help_text": "If multiplexed data is provided, this filter can be used to exclude certain samples from processing."
                }
            }
        },
        "output_options": {
            "title": "Output Options",
            "type": "object",
            "description": "Parameters for saving and naming workflow outputs.",
            "default": "",
            "properties": {
                "out_dir": {
                    "type": "string",
                    "default": "output",
                    "format": "directory-path",
                    "description": "Directory for output of all user-facing files."
                },
                "prefix": {
                    "type": "string",
                    "description": "The prefix attached to each of the output filenames."
                }
            }
        },
        "advanced_options": {
            "title": "Advanced Options",
            "type": "object",
            "description": "Advanced options for configuring processes inside the workflow.",
            "default": "",
            "properties": {
                "trim_length": {
                    "type": "integer",
                    "default": 150,
                    "description": "Number of base pairs to trim from both ends of each read."
                },
                "medaka_model": {
                    "type": "string",
                    "description": "The name of a Medaka model to use. By default the workflow will select an appropriate Medaka model from the basecaller configuration provided. Entering a name here will override the automated selection and use the Medaka model named here.",
                    "help_text": "The workflow will attempt to map the basecalling model used to a suitable Medaka model. You can override this by providing a model with this option instead."
                },
                "flye_quality": {
                    "type": "string",
                    "default": "nano-hq",
                    "description": "The Flye parameter for quality of input reads, default `nano-hq`: high-quality reads, Guppy5+ SUP or Q20 (<5% error).",
                    "help_text": "Other options include `nano-corr`: reads that were corrected with other methods (<3% error), `nano-raw`: pre-Guppy5 (<20% error).",
                    "enum": [
                        "nano-hq",
                        "nano-corr",
                        "nano-raw"
                    ]
                },
                "db_directory": {
                    "type": "string",
                    "title": "Database directory",
                    "format": "directory-path",
                    "description": "Optional directory containing a gene annotation database.",
                    "help_text": "A default generic annotation is provided in tar.gz format, containing entries from [fpbase](https://www.fpbase.org/), [Swiss-Prot](https://www.expasy.org/resources/uniprotkb-swiss-prot), [Rfam](https://rfam.org/) and [snapgene](https://www.snapgene.com/)."
                }
            }
        },
        "miscellaneous_options": {
            "title": "Miscellaneous Options",
            "type": "object",
            "description": "Everything else.",
            "default": "",
            "properties": {
                "threads": {
                    "type": "integer",
                    "default": 4,
                    "description": "Maximum number of CPU threads to use per workflow task.",
                    "help_text": "Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. The total CPU resource used by the workflow is constrained by the executor configuration."
                },
                "help": {
                    "type": "boolean",
                    "description": "Display help text.",
                    "fa_icon": "fas fa-question-circle",
                    "hidden": true,
                    "default": false
                },
                "version": {
                    "type": "boolean",
                    "default": false,
                    "description": "Display version and exit.",
                    "fa_icon": "fas fa-question-circle",
                    "hidden": true
                },
                "disable_ping": {
                    "type": "boolean",
                    "default": false,
                    "description": "Enable to prevent sending a workflow ping."
                }
            }
        }
    },
    "allOf": [
        {
            "$ref": "#/definitions/input_options"
        },
        {
            "$ref": "#/definitions/reference_genome_options"
        },
        {
            "$ref": "#/definitions/sample_options"
        },
        {
            "$ref": "#/definitions/output_options"
        },
        {
            "$ref": "#/definitions/advanced_options"
        },
        {
            "$ref": "#/definitions/miscellaneous_options"
        }
    ],
    "docs": {
        "intro": "## Introduction\n\nThe workflow accepts FASTQ as the primary input.\n\nOptional inputs:\n\n* A `host_reference` FASTA file.\n  \n* A primers TSV file if they differ from the default pRham/T7.\n\nThe steps of the workflow are as follows - \n\n* If a `host_reference` is provided, [minimap2](https://github.com/lh3/minimap2) is used to align the reads to the `host_reference`, aligned reads are filtered out.\n\n* The reads are then trimmed at the ends using [Seqkit](https://bioinf.shenwei.me/seqkit/) with the provided `--trim_length` parameter. \n\n* The sequences are then downsampled using the tool [Rasusa](https://github.com/mbhall88/rasusa).\n\n* [Trycycler](https://github.com/rrwick/Trycycler) is used to create 3 subsamples which are each assembled using [Flye](https://github.com/fenderglass/Flye).\n\n* If there are concatemers in the assembly these are found using minimap2 and deconcatenated using a Python script. \n\n* Trycycler is used to reconcile the subsampled assemblies into one final assembly. This is polished with [Medaka](https://github.com/nanoporetech/medaka).\n\n* [Seqkit](https://bioinf.shenwei.me/seqkit/) is used to find inserts using the primers supplied to the `--primers` parameter. \n\n* The assembly is annotated using [pLannotate](https://github.com/barricklab/pLannotate) with the default database containing entries from [fpbase](https://www.fpbase.org/), [Swiss-Prot](https://www.expasy.org/resources/uniprotkb-swiss-prot), [Rfam](https://rfam.org/) and [snapgene](https://www.snapgene.com/). \n\n* A quality score for the assembly is provided by Medaka.\n\n* Optionally a reference insert sequence can be provided with --insert_reference, which is aligned to the consensus and any variants are reported by [bcftools](https://samtools.github.io/bcftools/bcftools.html).",
        "links": "## Useful links\n\n* [nextflow](https://www.nextflow.io/)\n* [docker](https://www.docker.com/products/docker-desktop)\n* [singularity](https://docs.sylabs.io/)\n* [minimap2](https://github.com/lh3/minimap2)\n* [Seqkit](https://bioinf.shenwei.me/seqkit/)\n* [Rasusa](https://github.com/mbhall88/rasusa)\n* [Trycycler](https://github.com/rrwick/Trycycler)\n* [Flye](https://github.com/fenderglass/Flye)\n* [Medaka](https://github.com/nanoporetech/medaka)\n* [plannotate tool](https://github.com/barricklab/pLannotate)\n* [bcftools](https://samtools.github.io/bcftools/bcftools.html)"
    }
}