-
Notifications
You must be signed in to change notification settings - Fork 22
/
main.wdl
172 lines (145 loc) · 7.13 KB
/
main.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
version 1.0
import "humanwgs_structs.wdl"
import "wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl" as BackendConfiguration
import "sample_analysis/sample_analysis.wdl" as SampleAnalysis
import "cohort_analysis/cohort_analysis.wdl" as CohortAnalysis
import "tertiary_analysis/tertiary_analysis.wdl" as TertiaryAnalysis
workflow humanwgs {
  # End-to-end human whole-genome-sequencing pipeline:
  #   1. per-sample alignment, small-variant/SV/repeat/CNV calling and phasing,
  #   2. optional joint (cohort-level) calling when the cohort has >1 sample,
  #   3. optional tertiary annotation/filtering (slivar + svpack).

  input {
    Cohort cohort
    ReferenceData reference
    SlivarData? slivar_data

    String deepvariant_version = "1.5.0"
    DeepVariantModel? deepvariant_model

    Int? pbsv_call_mem_gb
    Int? glnexus_mem_gb

    Boolean run_tertiary_analysis = false

    # Backend configuration
    String backend
    String? zones
    String? aws_spot_queue_arn
    String? aws_on_demand_queue_arn
    String? container_registry
    Boolean preemptible
  }

  # Resolve backend-specific runtime settings (zones, AWS queues, registry).
  call BackendConfiguration.backend_configuration {
    input:
      backend = backend,
      zones = zones,
      aws_spot_queue_arn = aws_spot_queue_arn,
      aws_on_demand_queue_arn = aws_on_demand_queue_arn,
      container_registry = container_registry
  }

  # Pick spot vs. on-demand runtime attributes once, for all downstream calls.
  RuntimeAttributes selected_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes

  # Single-sample analysis for every member of the cohort.
  scatter (individual in cohort.samples) {
    call SampleAnalysis.sample_analysis {
      input:
        sample = individual,
        reference = reference,
        deepvariant_version = deepvariant_version,
        deepvariant_model = deepvariant_model,
        default_runtime_attributes = selected_runtime_attributes
    }
  }

  # Joint calling is only meaningful for cohorts with more than one sample.
  if (length(cohort.samples) > 1) {
    # Gather the sample IDs into an Array[String] via a scatter declaration.
    scatter (individual_for_id in cohort.samples) {
      String gathered_sample_id = individual_for_id.sample_id
    }

    call CohortAnalysis.cohort_analysis {
      input:
        cohort_id = cohort.cohort_id,
        sample_ids = gathered_sample_id,
        aligned_bams = flatten(sample_analysis.aligned_bams),
        svsigs = flatten(sample_analysis.svsigs),
        gvcfs = sample_analysis.small_variant_gvcf,
        reference = reference,
        pbsv_call_mem_gb = pbsv_call_mem_gb,
        glnexus_mem_gb = glnexus_mem_gb,
        default_runtime_attributes = selected_runtime_attributes
    }
  }

  # Tertiary analysis needs slivar data plus several optional reference
  # resources; it is skipped when any of them is missing.
  if (run_tertiary_analysis && defined(slivar_data) && defined(reference.gnomad_af) && defined(reference.hprc_af) && defined(reference.gff) && defined(reference.population_vcfs)) {
    # Prefer the joint cohort VCFs; for singleton cohorts fall back to the
    # (only) sample's phased VCFs.
    IndexData tertiary_small_variant_vcf = select_first([
      cohort_analysis.phased_joint_small_variant_vcf,
      sample_analysis.phased_small_variant_vcf[0]
    ])

    IndexData tertiary_sv_vcf = select_first([
      cohort_analysis.phased_joint_sv_vcf,
      sample_analysis.phased_sv_vcf[0]
    ])

    call TertiaryAnalysis.tertiary_analysis {
      input:
        cohort = cohort,
        small_variant_vcf = tertiary_small_variant_vcf,
        sv_vcf = tertiary_sv_vcf,
        reference = reference,
        slivar_data = select_first([slivar_data]),
        default_runtime_attributes = selected_runtime_attributes
    }
  }

  output {
    # sample_analysis: per-movie stats and alignments
    Array[Array[File]] bam_stats = sample_analysis.bam_stats
    Array[Array[File]] read_length_summary = sample_analysis.read_length_summary
    Array[Array[File]] read_quality_summary = sample_analysis.read_quality_summary

    # sample_analysis: per-sample small-variant calls
    Array[IndexData] small_variant_gvcfs = sample_analysis.small_variant_gvcf
    Array[File] small_variant_vcf_stats = sample_analysis.small_variant_vcf_stats
    Array[File] small_variant_roh_out = sample_analysis.small_variant_roh_out
    Array[File] small_variant_roh_bed = sample_analysis.small_variant_roh_bed

    # sample_analysis: phased variant calls and haplotagged alignments
    Array[IndexData] sample_phased_small_variant_vcfs = sample_analysis.phased_small_variant_vcf
    Array[IndexData] sample_phased_sv_vcfs = sample_analysis.phased_sv_vcf
    Array[File] sample_hiphase_stats = sample_analysis.hiphase_stats
    Array[File] sample_hiphase_blocks = sample_analysis.hiphase_blocks
    Array[File] sample_hiphase_haplotags = sample_analysis.hiphase_haplotags
    Array[IndexData] merged_haplotagged_bam = sample_analysis.merged_haplotagged_bam
    Array[File] haplotagged_bam_mosdepth_summary = sample_analysis.haplotagged_bam_mosdepth_summary
    Array[File] haplotagged_bam_mosdepth_region_bed = sample_analysis.haplotagged_bam_mosdepth_region_bed

    # sample_analysis: tandem-repeat genotyping (trgt)
    Array[IndexData] trgt_spanning_reads = sample_analysis.trgt_spanning_reads
    Array[IndexData] trgt_repeat_vcf = sample_analysis.trgt_repeat_vcf
    Array[File] trgt_dropouts = sample_analysis.trgt_dropouts

    # sample_analysis: CpG methylation pileups
    Array[Array[File]] cpg_pileup_beds = sample_analysis.cpg_pileup_beds
    Array[Array[File]] cpg_pileup_bigwigs = sample_analysis.cpg_pileup_bigwigs

    # sample_analysis: paraphase
    Array[File] paraphase_output_jsons = sample_analysis.paraphase_output_json
    Array[IndexData] paraphase_realigned_bams = sample_analysis.paraphase_realigned_bam
    Array[Array[File]] paraphase_vcfs = sample_analysis.paraphase_vcfs

    # sample_analysis: copy-number calling (hificnv)
    Array[IndexData] hificnv_vcfs = sample_analysis.hificnv_vcf
    Array[File] hificnv_copynum_bedgraphs = sample_analysis.hificnv_copynum_bedgraph
    Array[File] hificnv_depth_bws = sample_analysis.hificnv_depth_bw
    Array[File] hificnv_maf_bws = sample_analysis.hificnv_maf_bw

    # cohort_analysis (only set for multi-sample cohorts)
    IndexData? cohort_sv_vcf = cohort_analysis.phased_joint_sv_vcf
    IndexData? cohort_small_variant_vcf = cohort_analysis.phased_joint_small_variant_vcf
    File? cohort_hiphase_stats = cohort_analysis.hiphase_stats
    File? cohort_hiphase_blocks = cohort_analysis.hiphase_blocks

    # tertiary_analysis (only set when tertiary analysis ran)
    IndexData? filtered_small_variant_vcf = tertiary_analysis.filtered_small_variant_vcf
    IndexData? compound_het_small_variant_vcf = tertiary_analysis.compound_het_small_variant_vcf
    File? filtered_small_variant_tsv = tertiary_analysis.filtered_small_variant_tsv
    File? compound_het_small_variant_tsv = tertiary_analysis.compound_het_small_variant_tsv
    IndexData? filtered_svpack_vcf = tertiary_analysis.filtered_svpack_vcf
    File? filtered_svpack_tsv = tertiary_analysis.filtered_svpack_tsv
  }

  parameter_meta {
    cohort: {help: "Sample information for the cohort"}
    reference: {help: "Reference genome data"}
    slivar_data: {help: "Data files used for annotation with slivar (required if `run_tertiary_analysis` is set to `true`)"}
    deepvariant_version: {help: "Version of deepvariant to use"}
    deepvariant_model: {help: "Optional deepvariant model file to use"}
    pbsv_call_mem_gb: {help: "Optional amount of RAM in GB for pbsv_call; default 64 for cohorts N<=3, 96 for cohorts N>3"}
    glnexus_mem_gb: {help: "Optional amount of RAM in GB for glnexus; default 30"}
    run_tertiary_analysis: {help: "Run the optional tertiary analysis steps"}
    backend: {help: "Backend where the workflow will be executed ['GCP', 'Azure', 'AWS', 'HPC']"}
    zones: {help: "Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'"}
    aws_spot_queue_arn: {help: "Queue ARN for the spot batch queue; required if backend is set to 'AWS'"}
    aws_on_demand_queue_arn: {help: "Queue ARN for the on demand batch queue; required if backend is set to 'AWS'"}
    container_registry: {help: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used."}
    preemptible: {help: "Where possible, run tasks preemptibly"}
  }
}