diff --git a/composition_only.nf b/composition_only.nf
deleted file mode 100644
index 05cb2f8..0000000
--- a/composition_only.nf
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/usr/bin/env nextflow
-
-/*
-    Geneshot: A pipeline to robustly identify which alleles (née peptide coding sequences)
-    are present in a microbial community.
-
-    This is a workflow oriented around obtaining the composition of a community via WGS data.
-    It reuses components (primarily pre-processing) from the broader geneshot.
-*/
-
-// Using DSL-2
-nextflow.enable.dsl=2
-
-// Default values for boolean flags
-// If these are not set by the user, then they will be set to the values below
-// This is useful for the if/then control syntax below
-params.nopreprocess = false
-params.savereads = false
-params.help = false
-params.output = './results'
-params.manifest = false
-
-// Preprocessing options
-params.hg_index_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bwa_index.tar.gz'
-params.hg_index = false
-params.min_hg_align_score = 30
-
-// Make sure that --output ends with a trailing "/" character
-if (!params.output.endsWith("/")){
-    output_folder = params.output.concat("/")
-} else {
-    output_folder = params.output
-}
-
-
-// Import the preprocess_wf module
-include { Read_manifest } from './modules/general'
-include { Preprocess_wf } from './modules/preprocess' params(
-    hg_index: params.hg_index,
-    hg_index_url: params.hg_index_url,
-    min_hg_align_score: params.min_hg_align_score,
-    output: output_folder,
-    savereads: params.savereads
-)
-// Import some general tasks
-include { CombineReads } from './modules/preprocess' params(
-    savereads: params.savereads,
-    output_folder: output_folder
-)
-
-
-// Import from the composition_wf module
-include { composition_wf } from './modules/composition' params(
-    output_folder: output_folder
-)
-
-// Function which prints the help message text
-def helpMessage() {
-    log.info"""
-    A workflow oriented around obtaining the composition of a community via WGS data.
-    Reuses components (primarily pre-processing) from the broader geneshot.
-
-    Usage:
-
-    nextflow run composition_only.nf
-
-    Required Arguments:
-      --manifest            CSV file listing samples (see below)
-
-    Options:
-      --output              Folder to place analysis outputs (default ./results)
-      --nopreprocess        If specified, omit the preprocessing steps (removing adapters and human sequences)
-      --savereads           If specified, save the preprocessed reads to the output folder (inside qc/)
-      -w                    Working directory. Defaults to `./work`
-
-    For preprocessing:
-      --hg_index_url        URL for the human genome index (defaults to the current human genome build)
-      --hg_index            Cached copy of the bwa-indexed human genome, TGZ format
-      --min_hg_align_score  Minimum alignment score for the human genome (default 30)
-
-    Manifest file:
-      The manifest is a CSV with a header indicating which samples correspond to which files.
-      The file must contain a column `specimen`. This can be repeated.
-      Data for preprocessing is only accepted as paired reads.
-      Reads are specified by the columns `R1` and `R2`.
-      If index reads are provided, the column titles should be `I1` and `I2`.
-      If you wish to provide already-processed data in FASTA format, please include it in `R1` alone,
-      with only *one* file specified per specimen.
-    """.stripIndent()
-}
-
-// Show the help message if the user specifies the --help flag at runtime
-if (params.help || !params.manifest){
-    // Invoke the function above which prints the help message
-    helpMessage()
-    // Exit out and do not run anything else
-    exit 0
-}
-
-
-workflow {
-    main:
-
-    // Phase 0: Validation of input data
-    manifest_file = Channel.from(file(params.manifest))
-    // Read_manifest splits out our manifest.
-    manifest_qced = Read_manifest(manifest_file)
-
-    // Phase I: Preprocessing
-    if (!params.nopreprocess) {
-        // Run the entire preprocessing workflow
-        Preprocess_wf(
-            manifest_qced.valid_paired_indexed,
-            manifest_qced.valid_paired
-        )
-        // Combine the reads by specimen name
-        CombineReads(Preprocess_wf.out.groupTuple())
-
-    } else {
-        // If the user specified --nopreprocess, then just
-        // read the manifest and combine by specimen
-        CombineReads(
-            manifest_qced.valid_paired.mix(manifest_qced.valid_paired_indexed)
-                .map {
-                    r -> [r.specimen, file(r.R1), file(r.R2)]
-                }.groupTuple()
-        )
-
-    }
-
-    // ###############
-    // # Composition #
-    // ###############
-    composition_wf(
-        CombineReads.out,
-        manifest_qced.valid_unpaired.map{ r ->
-            [r.specimen, file(r.R1)]
-        }
-    )
-
-}
diff --git a/gene_abund.nf b/gene_abund.nf
deleted file mode 100644
index 3867ed4..0000000
--- a/gene_abund.nf
+++ /dev/null
@@ -1,363 +0,0 @@
-#!/usr/bin/env nextflow
-
-/*
-    Geneshot: A pipeline to robustly identify which alleles (née peptide coding sequences)
-    are present in a microbial community.
-
-    This utility extracts the proportion of gene copies from each specimen which are
-    annotated with a given function by the eggNOG-mapper functional annotation tool.
-
-    To use this utility, provide a set of previously-generated geneshot results to query,
-    as well as a string which will be used to select all eggNOG annotations which contain it.
-*/
-
-// Using DSL-2
-nextflow.enable.dsl=2
-
-// Parameters
-params.results_hdf = false
-params.details_hdf = false
-params.genes_fasta = false
-params.output_folder = false
-params.output_prefix = false
-params.query = false
-params.help = false
-
-// Function which prints the help message text
-def helpMessage() {
-    log.info"""
-    This utility extracts the proportion of gene copies from each specimen which are
-    annotated with a given function by the eggNOG-mapper functional annotation tool.
-
-    To use this utility, provide a set of previously-generated geneshot results to query,
-    as well as a string which will be used to select all eggNOG annotations which contain it.
-
-    Usage:
-
-    nextflow run Golob-Minot/geneshot/gene_abund.nf
-
-    Required Arguments:
-      --results_hdf         Location of the results.hdf5 generated by geneshot
-      --details_hdf         Location of the details.hdf5 generated by geneshot
-      --genes_fasta         Location of the input 'genes.fasta.gz'
-      --output_folder       Location for output files
-      --output_prefix       Prefix for output files
-      --query               Query string used to subset eggNOG gene descriptions
-
-    """.stripIndent()
-}
-
-// Show the help message if the user specifies the --help flag at runtime,
-// or if any of the required arguments is missing
-if (params.help || params.results_hdf == false || params.details_hdf == false || params.genes_fasta == false || params.output_prefix == false || params.output_folder == false || params.query == false){
-    // Invoke the function above which prints the help message
-    helpMessage()
-    // Exit out and do not run anything else
-    exit 0
-}
-
-workflow {
-
-    // Make sure we can find the input files
-    if(file(params.results_hdf).isEmpty()){
-        log.info"""Cannot find input file ${params.results_hdf}""".stripIndent()
-        exit 0
-    }
-    if(file(params.details_hdf).isEmpty()){
-        log.info"""Cannot find input file ${params.details_hdf}""".stripIndent()
-        exit 0
-    }
-    if(file(params.genes_fasta).isEmpty()){
-        log.info"""Cannot find input file ${params.genes_fasta}""".stripIndent()
-        exit 0
-    }
-
-    // Get the table of genes which contain this annotation
-    extractAnnotations(
-        file(params.results_hdf)
-    )
-
-    // Get the sequences of the genes which match this query
-    extractFASTA(
-        extractAnnotations.out,
-        file(params.genes_fasta)
-    )
-
-    // Get the user-provided manifest
-    extractManifest(
-        file(params.results_hdf)
-    )
-
-    // Get the proportion of gene copies for these genes across all specimens
-    extractAbund(
-        extractAnnotations.out,
-        file(params.details_hdf),
-        extractManifest.out
-    )
-
-}
-
-process extractAnnotations {
-
-    container "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.5"
-    label "mem_medium"
-
-    input:
-    file results_hdf
-
-    output:
-    file "${params.output_prefix}.genes.csv"
-
-    publishDir "${params.output_folder}", mode: 'copy', overwrite: true
-
-"""#!/usr/bin/env python3
-
-import os
-import pandas as pd
-
-# Set the input path
-results_hdf = '${results_hdf}'
-
-# Make sure that the file is present in the working folder
-assert os.path.exists(results_hdf)
-
-# Set up a function to filter a DataFrame by the query string
-query_str = '${params.query}'
-def filter_df(df):
-    return df.loc[
-        df['eggNOG_desc'].fillna('').apply(lambda s: query_str in s)
-    ]
-
-# Read in the table in chunks and filter as we go
-df = pd.concat([
-    filter_df(chunk_df)
-    for chunk_df in pd.read_hdf(
-        results_hdf,
-        '/annot/gene/all',
-        iterator=True
-    )
-])
-print("Number of genes containing the query '%s': %d" % (query_str, df.shape[0]))
-
-# If there are any genes matching this string
-if df.shape[0] > 0:
-
-    # Write out the smaller table
-    df.to_csv("${params.output_prefix}.genes.csv", index=None)
-
-    print("Done")
-
-else:
-
-    print("NO GENES FOUND MATCHING THE QUERY: %s" % query_str)
-
-"""
-
-}
-
-process extractManifest {
-
-    container "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.5"
-    label "mem_medium"
-
-    input:
-    file results_hdf
-
-    output:
-    file "${params.output_prefix}.manifest.csv"
-
-    publishDir "${params.output_folder}", mode: 'copy', overwrite: true
-
-"""#!/usr/bin/env python3
-
-import os
-import pandas as pd
-
-# Set the input path
-results_hdf = '${results_hdf}'
-
-# Make sure that the file is present in the working folder
-assert os.path.exists(results_hdf)
-
-# Read the manifest
-manifest_df = pd.read_hdf(results_hdf, "/manifest")
-
-# Write out to a file
-manifest_df.to_csv("${params.output_prefix}.manifest.csv", index=None)
-
-"""
-
-}
-
-process extractFASTA {
-
-    container "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.5"
-    label "io_limited"
-
-    input:
-    file gene_csv
-    file gene_fasta_gz
-
-    output:
-    file "${params.output_prefix}.genes.fasta.gz"
-
-    publishDir "${params.output_folder}", mode: 'copy', overwrite: true
-
-"""#!/usr/bin/env python3
-
-from Bio.SeqIO.FastaIO import SimpleFastaParser
-import gzip
-import os
-import pandas as pd
-
-# Set the input paths
-# Make sure that the files are present in the working folder
-gene_csv = '${gene_csv}'
-assert os.path.exists(gene_csv)
-gene_fasta_gz = '${gene_fasta_gz}'
-assert os.path.exists(gene_fasta_gz)
-output_fasta_gz = '${params.output_prefix}.genes.fasta.gz'
-assert not os.path.exists(output_fasta_gz)
-
-# Read in the table
-df = pd.read_csv(gene_csv)
-print("Read in annotations for %d genes" % df.shape[0])
-
-# Get the list of genes
-gene_names = set(df['gene'].tolist())
-
-# Keep a counter of how many genes we've found
-n_found = 0
-
-# Open the input and output files
-with gzip.open(gene_fasta_gz, 'rt') as handle_in, gzip.open(output_fasta_gz, 'wt') as handle_out:
-
-    # Iterate over the inputs
-    for gene_name, gene_seq in SimpleFastaParser(handle_in):
-
-        # If this is one of the genes we are looking for
-        if gene_name in gene_names:
-
-            # Write it out
-            handle_out.write(">%s\\n%s\\n" % (gene_name, gene_seq))
-
-            # Increment the counter
-            n_found += 1
-
-# Report the number of genes which were found
-print("Wrote out %d gene sequences" % n_found)
-print("DONE")
-
-"""
-
-}
-
-process extractAbund {
-
-    container "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.5"
-    label "mem_medium"
-
-    input:
-    file gene_csv
-    file details_hdf
-    file manifest_csv
-
-    output:
-    file "${params.output_prefix}.*.csv.gz"
-
-    publishDir "${params.output_folder}", mode: 'copy', overwrite: true
-
-"""#!/usr/bin/env python3
-
-import os
-import pandas as pd
-
-# Set the input paths
-# Make sure that the files are present in the working folder
-gene_csv = '${gene_csv}'
-assert os.path.exists(gene_csv)
-details_hdf = '${details_hdf}'
-assert os.path.exists(details_hdf)
-manifest_csv = '${manifest_csv}'
-assert os.path.exists(manifest_csv)
-
-# Read in the table of gene annotations
-df = pd.read_csv(gene_csv)
-print("Read in annotations for %d genes" % df.shape[0])
-
-# Get the list of genes
-gene_names = set(df['gene'].tolist())
-
-# Read in the manifest
-manifest_df = pd.read_csv(manifest_csv)
-
-# Keep the complete set of gene abundances in long format
-output = []
-
-# Open a connection to the HDF store
-with pd.HDFStore(details_hdf, 'r') as store:
-
-    # Set up an object to save the proportion of gene copies in each specimen
-    for specimen_name in manifest_df['specimen'].unique():
-
-        print("Reading in abundances for specimen '%s'" % specimen_name)
-
-        # Read in the full table
-        specimen_df = pd.read_hdf(store, "/abund/gene/long/%s" % specimen_name)
-
-        # Get the total depth for all gene copies
-        tot = specimen_df['depth'].sum()
-
-        # Subset to the genes of interest, add the specimen, add the proportion
-        # of gene copies, and append to the output
-        output.append(
-            specimen_df.loc[
-                specimen_df['id'].isin(gene_names)
-            ].assign(
-                specimen = specimen_name,
-                prop = lambda d: d['depth'] / tot
-            )
-        )
-
-# Combine all of the output
-print("Combining all outputs")
-output = pd.concat(
-    output
-).reset_index(
-    drop=True
-)
-
-# Add the eggNOG annotation
-print("Adding eggNOG names")
-output = output.assign(
-    eggNOG_desc = output['id'].apply(
-        df.set_index('gene')['eggNOG_desc'].get
-    )
-)
-
-# Save the long output
-print("Saving long output")
-output.to_csv(
-    "${params.output_prefix}.long.csv.gz",
-    index=None,
-    compression='gzip'
-)
-
-# Save the wide output
-output.pivot_table(
-    index="specimen",
-    columns="eggNOG_desc",
-    values="prop",
-    aggfunc=sum
-).fillna(
-    0
-).reset_index(
-).to_csv(
-    "${params.output_prefix}.wide.csv.gz",
-    index=None,
-    compression='gzip'
-)
-
-print("DONE")
-
-"""
-
-}
\ No newline at end of file
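The core of extractAnnotations above is a chunked substring filter over the /annot/gene/all table. A minimal standalone sketch of that query (the file path and query string are hypothetical, and the table must have been stored in HDF5 "table" format for chunked reads to work):

    #!/usr/bin/env python3
    # Standalone sketch of the annotation query performed by extractAnnotations
    import pandas as pd

    results_hdf = "results.hdf5"        # hypothetical path to a geneshot results file
    query_str = "Glycosyltransferase"   # hypothetical query string

    # Stream the gene annotation table in chunks, keeping only the rows
    # whose eggNOG description contains the query substring
    df = pd.concat([
        chunk.loc[chunk["eggNOG_desc"].fillna("").str.contains(query_str, regex=False)]
        for chunk in pd.read_hdf(results_hdf, "/annot/gene/all", chunksize=100000)
    ])
    print("Number of genes containing the query '%s': %d" % (query_str, df.shape[0]))
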
diff --git a/run_annotations.nf b/run_annotations.nf
deleted file mode 100755
index 2f46059..0000000
--- a/run_annotations.nf
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/env nextflow
-
-/*
-    Geneshot: A pipeline to robustly identify which alleles (née peptide coding sequences)
-    are present in a microbial community.
-
-    This utility performs taxonomic and/or functional annotations on an existing geneshot
-    results object.
-*/
-
-// Using DSL-2
-nextflow.enable.dsl=2
-
-// Parameters
-params.input_hdf = false
-params.gene_fasta = false
-params.output_hdf = false
-params.output_folder = false
-params.help = false
-params.min_coverage = 50
-params.min_identity = 90
-params.taxonomic_dmnd = false
-params.ncbi_taxdump = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
-params.eggnog_db = false
-params.eggnog_dmnd = false
-params.noannot = false
-
-// Function which prints the help message text
-def helpMessage() {
-    log.info"""
-    Add functional and/or taxonomic annotations to a geneshot output file.
-    This utility will create a new geneshot output file in which
-    the only difference is the annotation output results stored within.
-
-    Usage:
-
-    nextflow run Golob-Minot/geneshot/run_annotations.nf
-
-    Options:
-      --input_hdf           Location of the input HDF
-      --gene_fasta          Location of the input 'genes.fasta.gz'
-      --output_folder       Location for the output HDF
-      --output_hdf          Name of the output HDF to be placed in the output folder
-      --taxonomic_dmnd      Database used for taxonomic annotation (default: false)
-                            (Data available at s3://fh-ctr-public-reference-data/tool_specific_data/geneshot/2020-01-15-geneshot/DB.refseq.tax.dmnd)
-      --ncbi_taxdump        Reference describing the NCBI Taxonomy
-                            (default: ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz)
-      --eggnog_dmnd         One of two databases used for functional annotation with eggNOG (default: false)
-                            (Data available at s3://fh-ctr-public-reference-data/tool_specific_data/geneshot/2020-06-17-eggNOG-v5.0/eggnog_proteins.dmnd)
-      --eggnog_db           One of two databases used for functional annotation with eggNOG (default: false)
-                            (Data available at s3://fh-ctr-public-reference-data/tool_specific_data/geneshot/2020-06-17-eggNOG-v5.0/eggnog.db)
-
-    """.stripIndent()
-}
-
-// Show the help message if the user specifies the --help flag at runtime,
-// or if any of the required arguments is missing
-if (params.help || params.input_hdf == false || params.gene_fasta == false || params.output_hdf == false || params.output_folder == false){
-    // Invoke the function above which prints the help message
-    helpMessage()
-    // Exit out and do not run anything else
-    exit 0
-}
-
-// Show the help message if the user does not specify any annotations
-if (params.taxonomic_dmnd == false && params.eggnog_dmnd == false && params.eggnog_db == false){
-    // Invoke the function above which prints the help message
-    helpMessage()
-    // Exit out and do not run anything else
-    exit 0
-}
-
-// Import the processes used to add annotation results to the output
-include { readTaxonomy } from './modules/general'
-include { addTaxResults } from './modules/general'
-
-include { addEggnogResults } from './modules/general'
-
-include { repackHDF } from './modules/general' params(
-    output_folder: params.output_folder
-)
-
-// Import the workflows used for annotation
-include { Annotation_wf } from './modules/annotation' params(
-    output_folder: params.output_folder,
-    eggnog_db: params.eggnog_db,
-    eggnog_dmnd: params.eggnog_dmnd,
-    taxonomic_dmnd: params.taxonomic_dmnd,
-    noannot: params.noannot
-)
-
-// Process to rename the input HDF file
-process renameHDF{
-    container "ubuntu:20.04"
-    label 'io_limited'
-
-    input:
-    path "INPUT.RESULTS.HDF"
-    val new_file_name
-
-    output:
-    path "${new_file_name}"
-
-"""#!/bin/bash
-
-mv INPUT.RESULTS.HDF ${new_file_name}
-"""
-}
-
-
-workflow {
-
-    // Make sure we can find the input files
-    if(file(params.input_hdf).isEmpty()){
-        log.info"""Cannot find input file ${params.input_hdf}""".stripIndent()
-        exit 0
-    }
-    if(file(params.gene_fasta).isEmpty()){
-        log.info"""Cannot find input file ${params.gene_fasta}""".stripIndent()
-        exit 0
-    }
-
-    // Run the annotation steps on the gene catalog
-    Annotation_wf(
-        file(params.gene_fasta)
-    )
-
-    // Rename the input HDF
-    renameHDF(
-        file(params.input_hdf),
-        params.output_hdf
-    )
-    outputHDF = renameHDF.out
-
-    // Add the eggNOG results
-    if (Annotation_wf.out.eggnog_tsv != false && params.eggnog_db != false && params.eggnog_dmnd != false) {
-        addEggnogResults(
-            outputHDF,
-            Annotation_wf.out.eggnog_tsv
-        )
-        outputHDF = addEggnogResults.out
-    }
-
-    // Add the taxonomic assignment results
-    if (Annotation_wf.out.tax_tsv != false && params.ncbi_taxdump != false && params.taxonomic_dmnd != false) {
-        readTaxonomy(
-            file(params.ncbi_taxdump)
-        )
-
-        addTaxResults(
-            outputHDF,
-            Annotation_wf.out.tax_tsv,
-            readTaxonomy.out
-        )
-
-        outputHDF = addTaxResults.out
-    }
-
-    // Repack the HDF
-    repackHDF(
-        outputHDF
-    )
-
-}
\ No newline at end of file
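As a usage illustration in the style of the help message above (all file paths are hypothetical; the database file names follow the S3 objects listed in the options), an invocation supplying both taxonomic and functional annotation might be:

    nextflow run Golob-Minot/geneshot/run_annotations.nf \
        --input_hdf results.hdf5 \
        --gene_fasta genes.fasta.gz \
        --output_folder ./annotated \
        --output_hdf results.annotated.hdf5 \
        --taxonomic_dmnd DB.refseq.tax.dmnd \
        --eggnog_dmnd eggnog_proteins.dmnd \
        --eggnog_db eggnog.db
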
diff --git a/run_corncob.nf b/run_corncob.nf
deleted file mode 100644
index fee6620..0000000
--- a/run_corncob.nf
+++ /dev/null
@@ -1,258 +0,0 @@
-#!/usr/bin/env nextflow
-
-/*
-    Geneshot: A pipeline to robustly identify which alleles (née peptide coding sequences)
-    are present in a microbial community.
-
-    This utility runs the corncob algorithm with a specified formula on an
-    existing geneshot output file object.
-*/
-
-// Using DSL-2
-nextflow.enable.dsl=2
-
-// Parameters
-params.input_hdf = false
-params.input_folder = false
-params.output_folder = false
-params.output_prefix = false
-params.formula = false
-params.fdr_method = "fdr_bh"
-params.corncob_batches = 10
-params.help = false
-
-// Docker containers
-container__pandas = "quay.io/fhcrc-microbiome/python-pandas:v1.0.3"
-
-// Function which prints the help message text
-def helpMessage() {
-    log.info"""
-    Run corncob with a specified formula on a geneshot output file.
-    This utility will create a new geneshot output file in which
-    the only difference is the corncob output results stored within.
-
-    Usage:
-
-    nextflow run Golob-Minot/geneshot/run_corncob.nf
-
-    Options:
-      --input_hdf           Geneshot results HDF5 file to be used for analysis
-      --input_folder        Folder containing geneshot output to be used for analysis (must contain the subfolder "abund/")
-      --output_folder       Folder to place the output file
-      --output_prefix       Text used as a prefix for summary HDF5 output files (the .corncob.hdf5 suffix will be attached)
-      -w                    Working directory. Defaults to `./work`
-
-    For statistical analysis:
-      --formula             Formula used to estimate associations with CAG relative abundance
-      --fdr_method          FDR method used to calculate q-values for associations (default: 'fdr_bh')
-      --corncob_batches     Number of parallel processes to use for each formula
-
-    """.stripIndent()
-}
-
-// Show the help message if the user specifies the --help flag at runtime,
-// or if any of the required arguments is missing
-if (params.help || params.input_hdf == false || params.input_folder == false || params.output_folder == false || params.output_prefix == false || params.formula == false){
-    // Invoke the function above which prints the help message
-    helpMessage()
-    // Exit out and do not run anything else
-    exit 0
-}
-
-// Import the process used to shard CAGs for corncob
-include { splitCorncob } from './modules/statistics' params(
-    corncob_batches: params.corncob_batches
-)
-
-// Import the process used for statistical analysis
-include { runCorncob } from './modules/statistics' params(
-    formula: params.formula
-)
-
-// Import the process used to join the results for multiple formulae
-include { joinCorncob } from './modules/statistics' params(
-    output_folder: params.output_folder
-)
-
-// Import the process used to aggregate readcounts across all samples
-// This process is only run if `abund/CAG.readcounts.csv.gz` cannot be found yet
-// After the process completes, a new `abund/CAG.readcounts.csv.gz` will be added to that input folder
-include { extractCounts } from './modules/statistics' params(
-    output_folder: params.input_folder
-)
-
-// Import the processes needed to run meta-analysis of corncob results by annotation
-include { runBetta } from './modules/statistics'
-include { addBetta } from './modules/statistics' params(
-    fdr_method: params.fdr_method
-)
-
-// Import the process used to add corncob results to the output
-include { repackHDF } from './modules/general' params(
-    output_folder: params.output_folder
-)
-include { addCorncobResults } from './modules/general' params(
-    fdr_method: params.fdr_method
-)
-
-// Process to update the formula listed in the summary table
-// Also extract the manifest to reduce the number of times
-// the file is opened, as well as the table listing the genes
-// which make up each CAG.
-process updateFormula{
-    container "${container__pandas}"
-    label 'mem_medium'
-    errorStrategy 'finish'
-
-    input:
-    path results_hdf
-
-    output:
-    path "*corncob.hdf5"
-    path "manifest.csv"
-    path "CAG.assignments.csv.gz"
-
-"""
-#!/usr/bin/env python3
-
-import pandas as pd
-import shutil
-import pickle
-pickle.HIGHEST_PROTOCOL = 4
-
-# Come up with a new name for the output file
-formula_name = "${params.formula}".replace(
-    " ", "_"
-).replace(
-    "+", "_"
-).replace(
-    ",", "_"
-).replace(
-    "*", "_"
-).replace(
-    ":", "_"
-).replace(
-    "__", "_"
-).replace(
-    "__", "_"
-)
-new_file_name = "${params.output_prefix}.%s.corncob.hdf5" % formula_name
-
-# Copy the input file to a new output file object
-print("Copying %s to %s" % ("${results_hdf}", new_file_name))
-shutil.copyfile("${results_hdf}", new_file_name)
-
-# Open a connection to the store
-with pd.HDFStore(new_file_name, "a") as store:
-
-    # Read in the existing summary table
-    df = pd.read_hdf(store, "/summary/experiment")
-
-    print("Existing summary:")
-    print(df)
-
-    # Add the formula
-    df = pd.concat([
-        df.query("variable != 'formula'"),
-        pd.DataFrame([dict(variable='formula', value='${params.formula}')])
-    ])
-
-    print("")
-    print("New summary:")
-    print(df)
-
-    # Write back to the store
-    df.to_hdf(store, "/summary/experiment")
-
-    # Extract the manifest
-    print("Extracting the manifest")
-    pd.read_hdf(store, "/manifest").to_csv("manifest.csv")
-
-    # Extract the table of genes making up each CAG
-    print("Extracting the gene~CAG table")
-    pd.read_hdf(
-        store,
-        "/annot/gene/all",
-        columns=["gene", "CAG"]
-    ).to_csv(
-        "CAG.assignments.csv.gz",
-        index=None
-    )
-
-print("Done")
"""
-
-}
-
-workflow {
-
-    // Make sure we can find the input file
-    if(file(params.input_hdf).isEmpty()){
-        log.info"""Cannot find input file ${params.input_hdf}""".stripIndent()
-        exit 0
-    }
-
-    // First update the formula listed in the geneshot results file,
-    // a process which will also extract the manifest CSV
-    updateFormula(
-        file(params.input_hdf)
-    )
-
-    // Check to see if `abund/CAG.readcounts.csv.gz` is present in the input folder
-    // For context, this file is only created by geneshot for runs which originally
-    // included a `--formula`.
-    // If this file cannot be found, then we will run a process to create that file
-    // and also to publish that file back to the input folder.
-    abund_folder = "${params.input_folder.replaceAll(/\/$/, "")}/abund"
-    if(file("${abund_folder}/CAG.readcounts.csv.gz").isEmpty()){
-        extractCounts(
-            Channel.fromPath(
-                "${abund_folder}/details/*json.gz"
-            ).toSortedList(),
-            updateFormula.out[2]
-        )
-        input_csv = extractCounts.out
-    } else {
-        input_csv = file("${abund_folder}/CAG.readcounts.csv.gz")
-    }
-
-    // Set up a channel with the strings of the formula(s) provided
-    formula_ch = Channel.of(
-        params.formula.split(",")
-    )
-
-    // Shard the readcounts table into batches
-    splitCorncob(
-        input_csv
-    )
-
-    // Now run corncob on the extracted manifest, as well as the gene counts table
-    runCorncob(
-        splitCorncob.out.flatten(),
-        updateFormula.out[1],
-        formula_ch
-    )
-
-    // Join the results across all shards and formulae
-    joinCorncob(
-        runCorncob.out.toSortedList()
-    )
-
-    // Add those results to the output file
-    addCorncobResults(
-        updateFormula.out[0],
-        joinCorncob.out
-    )
-
-    // Run the betta meta-analysis over the corncob results
-    runBetta(
-        addCorncobResults.out[1].flatten()
-    )
-
-    addBetta(
-        addCorncobResults.out[0],
-        runBetta.out.toSortedList()
-    )
-
-    // Repack the HDF
-    repackHDF(
-        addBetta.out[0]
-    )
-
-}
\ No newline at end of file
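As a usage illustration (paths, prefix, and formula are hypothetical), a typical invocation might be:

    nextflow run Golob-Minot/geneshot/run_corncob.nf \
        --input_hdf geneshot.results.hdf5 \
        --input_folder ./geneshot_output \
        --output_folder ./corncob_output \
        --output_prefix study1 \
        --formula "treatment+age"

Note that a comma-separated --formula value is split into multiple formulae (see formula_ch above), each of which is fit independently.
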
diff --git a/update_manifest.nf b/update_manifest.nf
deleted file mode 100644
index 8d60883..0000000
--- a/update_manifest.nf
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/env nextflow
-
-/*
-    Geneshot: A pipeline to robustly identify which alleles (née peptide coding sequences)
-    are present in a microbial community.
-
-    This utility replaces the /manifest table in an existing geneshot results output file.
-*/
-
-// Using DSL-2
-nextflow.enable.dsl=2
-
-// Parameters
-params.input_hdf = false
-params.output_folder = false
-params.output_prefix = false
-params.manifest = false
-params.help = false
-
-// Docker containers
-container__pandas = "quay.io/fhcrc-microbiome/python-pandas:v1.0.3"
-
-// Function which prints the help message text
-def helpMessage() {
-    log.info"""
-    Replace the /manifest in an existing geneshot output file.
-
-    Usage:
-
-    nextflow run Golob-Minot/geneshot/update_manifest.nf
-
-    Options:
-      --manifest            Manifest (CSV) to be used in the updated geneshot output file
-      --input_hdf           Geneshot results HDF5 file to be used for analysis
-      --output_folder       Folder to place the output file
-      --output_prefix       Text used as a prefix for the output HDF5 file (the .results.hdf5 suffix will be attached)
-      -w                    Working directory. Defaults to `./work`
-
-    """.stripIndent()
-}
-
-// Show the help message if the user specifies the --help flag at runtime,
-// or if any of the required arguments is missing
-if (params.help || params.input_hdf == false || params.manifest == false || params.output_folder == false || params.output_prefix == false){
-    // Invoke the function above which prints the help message
-    helpMessage()
-    // Exit out and do not run anything else
-    exit 0
-}
-
-// Import the process used to repack the output HDF5 file
-include { repackHDF } from './modules/general' params(
-    output_folder: params.output_folder
-)
-
-// Process to replace the manifest stored in the results file
-process replaceManifest {
-    container "${container__pandas}"
-    label 'mem_medium'
-    errorStrategy 'finish'
-
-    input:
-    path "input.hdf5"
-    path manifest_csv
-
-    output:
-    path "${params.output_prefix}.results.hdf5"
-
-"""
-#!/usr/bin/env python3
-
-import pandas as pd
-import shutil
-import pickle
-pickle.HIGHEST_PROTOCOL = 4
-
-# Read in the new manifest
-new_manifest = pd.read_csv("${manifest_csv}", sep=",")
-
-# Make sure that the new manifest has the three required columns
-for k in ["specimen", "R1", "R2"]:
-    assert k in new_manifest.columns.values, "New manifest must contain a %s column" % k
-
-# Remove the read path columns, if present
-for k in ["R1", "R2", "I1", "I2"]:
-    if k in new_manifest.columns.values:
-        new_manifest = new_manifest.drop(columns=k)
-
-# Assign the new name for the output file
-new_file_name = "${params.output_prefix}.results.hdf5"
-
-# Copy the input file to a new output file object
-print("Copying input.hdf5 to %s" % new_file_name)
-shutil.copyfile("input.hdf5", new_file_name)
-
-# Open a connection to the store
-with pd.HDFStore(new_file_name, "a") as store:
-
-    # Read in the existing manifest
-    old_manifest = pd.read_hdf(store, "/manifest")
-
-    # Make sure that the new and old manifests have the same set of specimens
-    old_specimen_set = set(old_manifest["specimen"].tolist())
-    new_specimen_set = set(new_manifest["specimen"].tolist())
-
-    for s, msg in [
-        (old_specimen_set - new_specimen_set, "Specimens missing from old specimen list"),
-        (new_specimen_set - old_specimen_set, "Specimens missing from new specimen list"),
-    ]:
-        assert len(s) == 0, "%s: %s" % (msg, ", ".join(list(s)))
-
-    # Write the new manifest to the store
-    new_manifest.to_hdf(store, "/manifest")
-
-print("Done")
-"""
-
-}
-
-workflow {
-
-    // Update the manifest
-    replaceManifest(
-        file(params.input_hdf),
-        file(params.manifest)
-    )
-
-    // Repack the HDF; repackHDF publishes the repacked file
-    // to the output folder via its publishDir directive
-    repackHDF(
-        replaceManifest.out
-    )
-
-}
\ No newline at end of file
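
Before running update_manifest.nf, the replacement manifest can be validated against the checks enforced by replaceManifest above. A minimal sketch (file paths are hypothetical):

    #!/usr/bin/env python3
    # Sketch: pre-validate a replacement manifest for update_manifest.nf.
    # Mirrors the assertions in replaceManifest: the CSV must contain
    # specimen, R1, and R2 columns, and the same specimen set as the
    # manifest already stored in the results HDF5.
    import pandas as pd

    old = pd.read_hdf("results.hdf5", "/manifest")   # existing manifest
    new = pd.read_csv("new_manifest.csv")            # candidate replacement

    # The three required columns
    for col in ["specimen", "R1", "R2"]:
        assert col in new.columns, "New manifest must contain a %s column" % col

    # The specimen sets must match exactly in both directions
    assert set(old["specimen"]) == set(new["specimen"]), "Specimen sets differ"
    print("Manifest passes the checks enforced by replaceManifest")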