Golob-Minot · jgolob · Apr 29, 2021 · Apr 29, 2021
diff --git a/main.nf b/main.nf
@@ -91,9 +91,6 @@ def helpMessage() {
     For preprocessing:
       --hg_index_url        URL for human genome index, defaults to current HG
       --hg_index            Cached copy of the bwa indexed human genome, TGZ format
-      --adapter_F           Forward sequencing adapter sequence (to be removed)
-      --adapter_R           Reverse sequencing adapter sequence (to be removed)
-                              (Adapter sequences default to nextera adapters)
       --min_hg_align_score  Minimum alignment score for human genome (default 30)
 
     For Assembly:
@@ -180,7 +177,7 @@ include { WriteManifest} from './modules/preprocess' params(
     output: output_folder
 )
 // Import some general tasks, such as CombineReads and writeManifest
-include { read_manifest } from './modules/general'
+include { Read_manifest } from './modules/general'
 include { countReads } from './modules/general'
 include { countReadsSummary } from './modules/general' params(
     output_folder: output_folder
@@ -310,7 +307,7 @@ workflow {
         manifest_file = Channel.from(file(params.manifest))
     }
 
-    manifest_qced = read_manifest(manifest_file)
+    manifest_qced = Read_manifest(manifest_file)
 
     // Phase I: Preprocessing
     if (!params.nopreprocess) {

diff --git a/modules/alignment.nf b/modules/alignment.nf
@@ -202,7 +202,7 @@ process diamond {
     file refdb
 
     output:
-    tuple sample_name, file("${sample_name}.aln.gz")
+    tuple val(sample_name), file("${sample_name}.aln.gz")
 
     """
     set -e
@@ -246,7 +246,7 @@ process famli {
     errorStrategy 'finish'
 
     input:
-    tuple sample_name, file(input_aln)
+    tuple val(sample_name), file(input_aln)
 
     output:
     path "${sample_name}.json.gz"

diff --git a/modules/assembly.nf b/modules/assembly.nf
@@ -209,10 +209,10 @@ process assembly {
     publishDir "${params.output_folder}/assembly/${specimen}", mode: "copy"
 
     input:
-        tuple specimen, file(R1), file(R2)
+        tuple val(specimen), file(R1), file(R2)
 
     output:
-        tuple specimen, file("${specimen}.contigs.fasta.gz"), file("${specimen}.megahit.log")
+        tuple val(specimen), file("${specimen}.contigs.fasta.gz"), file("${specimen}.megahit.log")
 
 """
 set -e 

diff --git a/modules/general.nf b/modules/general.nf
@@ -9,7 +9,7 @@ container__pandas = "quay.io/fhcrc-microbiome/python-pandas:v1.0.3"
 params.fdr_method = "fdr_bh"
 
 // Function to read in a CSV and return a Channel
-def read_manifest(manifest_file){
+def Read_manifest(manifest_file){
     manifest_file.splitCsv(
         header: true, 
         sep: ","
@@ -21,104 +21,8 @@ def read_manifest(manifest_file){
     }
 }
 
-workflow combineReads {
-    take:
 
-        fastq_ch
 
-    main:
-
-        fastq_ch.branch {  // Split up the samples which have multiple FASTQ files
-            single: it[1].size() == 1
-            multiple: it[1].size() > 1
-        }.set {
-            grouped_fastq
-        }
-
-        joinFASTQ(
-            grouped_fastq.multiple
-        )
-
-    emit:
-        grouped_fastq.single.map {
-            r -> [r[0], r[1][0], r[2][0]]
-        }.mix(
-            joinFASTQ.out
-        )
-
-}
-
-process joinFASTQ {
-    tag "Join FASTQ files per-specimen"
-    container "${container__fastatools}"
-    label = 'mem_medium'
-    errorStrategy 'finish'
-    maxRetries 10
-
-    // If the user sets --preprocess_output, write out the combined reads to that folder
-    publishDir path: "${params.output_folder}qc/", enabled: params.savereads, mode: "copy"
-
-    input:
-    tuple val(sample), file("R1.*.fastq.gz"), file("R2.*.fastq.gz")
-
-    output:
-    tuple val(sample), file("${sample}.R1.fastq.gz"), file("${sample}.R2.fastq.gz")
-
-"""
-set -e
-
-ls -lah *
-
-combine_fastq_pairs.py \
--1 R1*fastq.gz \
--2 R2*fastq.gz \
---normalize-ids \
--o1 "${sample}.R1.fastq.gz" \
--o2 "${sample}.R2.fastq.gz"
-
-(( \$(gunzip -c "${sample}.R1.fastq.gz" | head | wc -l) > 1 ))
-(( \$(gunzip -c "${sample}.R2.fastq.gz" | head | wc -l) > 1 ))
-
-"""
-
-}
-
-process outputManifest {
-    container "${container__ubuntu}"
-
-    publishDir path: "${params.output_folder}qc/", enabled: params.savereads, mode: "copy"
-
-    input:
-        val manifestStr
-
-    output:
-        file 'manifest.qc.csv'
-
-    """
-        echo "${manifestStr}" > manifest.qc.csv
-    """
-}
-
-// Workflow to publish a set of reads to a folder, along with a manifest
-workflow writeManifest {
-    take:
-        reads_ch
-
-    main:
-        // Make a manifest for the files in reads_ch
-        // Output the final reads and manifest
-
-
-        manifestStr = reads_ch.reduce(
-            'specimen,R1,R2\n'
-        ){ csvStr, row ->
-            return  csvStr += "${row[0]},${params.output_folder}qc/${row[1].name},${params.output_folder}qc/${row[2].name}\n";
-        }
-
-        // Write the manifest CSV to a file
-        outputManifest(manifestStr)
-
-}
 
 // Count the number of input reads for a single sample
 process countReads {
@@ -129,7 +33,7 @@ process countReads {
     errorStrategy "finish"
 
     input:
-    tuple sample_name, file(R1), file(R2)
+    tuple val(sample_name), file(R1), file(R2)
 
     output:
     file "${sample_name}.countReads.csv"

diff --git a/modules/preprocess.nf b/modules/preprocess.nf
@@ -1,19 +1,12 @@
+nextflow.preview.dsl=2
+
 // Container versions
 container__barcodecop = "quay.io/fhcrc-microbiome/barcodecop:barcodecop_0.5.3"
 container__trimgalore = 'quay.io/biocontainers/trim-galore:0.6.6--0'
 container__bwa = "quay.io/fhcrc-microbiome/bwa:bwa.0.7.17__bcw.0.3.0I"
 container__fastatools = "quay.io/fhcrc-microbiome/fastatools:0.7.1__bcw.0.3.2"
 container__ubuntu = "ubuntu:18.04"
 
-// Defaults
-// Preprocessing options
-params.hg_index_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bwa_index.tar.gz'
-params.hg_index = false
-params.min_hg_align_score = 30
-params.savereads = false
-params.output = 'results/'
-
-
 // Function to filter a manifest to those rows which 
 // have values for specimen, R1, and R2, but are missing any values in I1 or I2
 def filter_no_index(manifest_ch){
@@ -127,15 +120,15 @@ workflow Preprocess_wf {
 process Barcodecop {
     tag "Validate barcode demultiplexing for WGS reads"
     container "${container__barcodecop}"
-    label 'mem_medium'
+    label 'multithread'
     errorStrategy 'finish'
 
     input:
-        tuple specimen, file(R1), file(R2), file(I1), file(I2)
+        tuple val(specimen), file(R1), file(R2), file(I1), file(I2)
 
     output:
-        tuple specimen, file("${R1}.bcc.fq.gz"), file("${R2}.bcc.fq.gz"), emit: bcc_to_cutadapt_ch
-        tuple specimen, file("${R1}.bcc.fq.gz"), file("${R2}.bcc.fq.gz"), emit: bcc_empty_ch
+        tuple val(specimen), file("${R1}.bcc.fq.gz"), file("${R2}.bcc.fq.gz"), emit: bcc_to_cutadapt_ch
+        tuple val(specimen), file("${R1}.bcc.fq.gz"), file("${R2}.bcc.fq.gz"), emit: bcc_empty_ch
 """
 set -e
 
@@ -222,10 +215,10 @@ process BWA_remove_human {
 
     input:
         file hg_index_tgz
-        tuple sample_name, file(R1), file(R2)
+        tuple val(sample_name), file(R1), file(R2)
 
     output:
-        tuple sample_name, file("${R1.getSimpleName()}.noadapt.nohuman.fq.gz"), file("${R2.getSimpleName()}.noadapt.nohuman.fq.gz")
+        tuple val(sample_name), file("${R1.getSimpleName()}.noadapt.nohuman.fq.gz"), file("${R2.getSimpleName()}.noadapt.nohuman.fq.gz")
 
 
 """
@@ -366,4 +359,78 @@ workflow CombineReads {
         )
 
 }
+//
+// Standalone entry point: everything below lets this module be launched by itself to run only the preprocessing step.
+//
+
+// Default values for the run-level parameters (flags, output folder, manifest)
+// If these are not set by the user, then they will be set to the values below
+// The entry workflow below tests params.help and params.manifest, so both need a defined default
+params.nopreprocess = false
+params.savereads = false
+params.help = false
+params.output = './results/'
+params.manifest = null
+
+// Preprocessing options: human genome bwa index (download URL, or a cached TGZ copy) and the minimum alignment score
+params.hg_index_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bwa_index.tar.gz'
+params.hg_index = false
+params.min_hg_align_score = 30
+
+// Import the manifest reader from the sibling module general.nf (the include path is relative to this file)
+include { Read_manifest } from './general'
+
+
+// Log (via log.info) the usage text for launching this module on its own; takes no arguments, returns nothing useful
+def helpMessage() {
+    log.info"""
+    Usage:
+
+    nextflow run Golob-Minot/geneshot -main-script modules/preprocess.nf <ARGUMENTS>
+
+    Required Arguments:
+      --manifest            CSV file listing samples (see below)
+
+    Options:
+      --output              Folder to place analysis outputs (default ./results/)
+      --savereads           If specified, save the preprocessed reads to the output folder (inside qc/)
+      -w                    Working directory. Defaults to `./work`
+
+    For preprocessing:
+      --hg_index_url        URL for human genome index, defaults to current HG
+      --hg_index            Cached copy of the bwa indexed human genome, TGZ format
+      --min_hg_align_score  Minimum alignment score for human genome (default 30)
+
+    Manifest:
+      The manifest is a CSV with a header indicating which samples correspond to which files.
+      The file must contain a column `specimen`. This can be repeated. 
+      Data is only accepted as paired reads.
+      Reads are specified by columns, `R1` and `R2`.
+      If index reads are provided, the column titles should be 'I1' and 'I2'
+
+    """.stripIndent()
+}
+
+
+workflow {
+    // Unnamed workflow: the entry point when this file is launched directly as the main script.
+    main:
+
+    // Nothing can run without a manifest, so a missing --manifest is treated like --help:
+    // print the usage text defined above and stop before any channel is created.
+    if (params.manifest == null || params.help) {
+        helpMessage()
+        exit 0
+    }
+
+    // Wrap the manifest file in a channel and hand it to Read_manifest,
+    // whose valid_paired_indexed / valid_paired outputs are consumed below
+    manifest_qced = Read_manifest(
+        Channel.from(file(params.manifest))
+    )
+
+    // Run the preprocessing sub-workflow on both outputs (argument order matters)
+    Preprocess_wf(manifest_qced.valid_paired_indexed, manifest_qced.valid_paired)
+
 
+}