gene_abund.nf

#!/usr/bin/env nextflow

/*
  Geneshot: A pipeline to robustly identify which alleles (i.e. peptide coding sequences)
  are present in a microbial community.

  This utility extracts the proportion of gene copies from each specimen which are
  annotated with a given function by the eggNOG-mapper functional annotation tool.

  To use this utility, provide a set of previously-generated geneshot results to query,
  as well as a string which will be used to select all eggNOG annotations which contain it.
*/

// Using DSL-2
nextflow.preview.dsl=2

// Parameters
// Every option defaults to `false`, which is the value the argument check
// in this script compares against to detect options the user did not supply.
params.results_hdf = false    // Location for results.hdf5 generated by geneshot
params.details_hdf = false    // Location for details.hdf5 generated by geneshot
params.genes_fasta = false    // Location for input 'genes.fasta.gz'
params.output_folder = false  // Location for output files
params.output_prefix = false  // Prefix for output files
params.query = false          // Query string used to subset eggNOG gene descriptions
params.help = false           // Set with --help to print the usage text

// Write the usage text for this utility to the Nextflow log
def helpMessage() {
    // Assemble the usage text, dropping the indentation shared by its lines
    def usage_text = """
    This utility extracts the proportion of gene copies from each specimen which are
    annotated with a given function by the eggNOG-mapper functional annotation tool.

    To use this utility, provide a set of previously-generated geneshot results to query,
    as well as a string which will be used to select all eggNOG annotations which contain it.

    Usage:

    nextflow run Golob-Minot/geneshot/gene_abund.nf <ARGUMENTS>
    
    Options:
      --results_hdf         Location for results.hdf5 generated by geneshot
      --details_hdf         Location for details.hdf5 generated by geneshot
      --genes_fasta         Location for input 'genes.fasta.gz'
      --output_folder       Location for output files
      --output_prefix       Prefix for output files
      --query               Query string to use to subset eggNOG gene descriptions
    
    """.stripIndent()

    // Emit the text at the INFO level
    log.info(usage_text)
}

// Show the help message and stop if the user specifies the --help flag at runtime.
// An explicit request for help is not an error, so the exit status is 0.
if (params.help){
    // Invoke the function above which prints the help message
    helpMessage()
    // Exit out and do not run anything else
    exit 0
}

// Every one of these options must be provided by the user; an option which
// still holds its default value of `false` was not supplied
def required_params = [
    'results_hdf',
    'details_hdf',
    'genes_fasta',
    'output_prefix',
    'output_folder',
    'query'
]
def missing_params = required_params.findAll { params[it] == false }

// A run which is missing required options cannot proceed. Print the usage
// text, name the missing options, and stop with a non-zero exit status so
// that calling scripts and schedulers can tell that nothing was executed.
if (missing_params){
    helpMessage()
    exit 1, "Missing required parameter(s): " + missing_params.collect { "--" + it }.join(", ")
}

// Main workflow: check the inputs, select the genes whose eggNOG annotation
// matches the query, then extract their sequences and per-specimen abundances
workflow {

    // Make sure we can find the input files.
    // `isEmpty()` is the same check applied to each of the three inputs; a
    // failure is reported as an error and stops the run with a non-zero exit
    // status, so that a missing input is never mistaken for a successful run.
    for (input_path in [params.results_hdf, params.details_hdf, params.genes_fasta]) {
        if (file(input_path).isEmpty()) {
            exit 1, "Cannot find input file ${input_path}"
        }
    }

    // Get the table of genes which contain this annotation
    extractAnnotations(
        file(params.results_hdf)
    )

    // Get the sequences of the genes which match this query
    extractFASTA(
        extractAnnotations.out,
        file(params.genes_fasta)
    )

    // Get the user-provided manifest
    extractManifest(
        file(params.results_hdf)
    )

    // Get the proportion of gene copies for these genes across all specimens
    extractAbund(
        extractAnnotations.out,
        file(params.details_hdf),
        extractManifest.out
    )

}

// Select the rows of the gene annotation table (stored under '/annot/gene/all'
// in the results HDF5) whose 'eggNOG_desc' value contains the query string,
// and write them to <output_prefix>.genes.csv
process extractAnnotations {

    // Directives are declared ahead of the input and output blocks
    container "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.5"
    label "mem_medium"
    publishDir "${params.output_folder}", mode: 'copy', overwrite: true

    input:
    file results_hdf

    output:
    file "${params.output_prefix}.genes.csv"

"""#!/usr/bin/env python3

import os
import sys
import pandas as pd

# Set the input path
results_hdf = '${results_hdf}'

# Make sure that the file is present in the working folder
assert os.path.exists(results_hdf)

# The user-provided query string
query_str = '${params.query}'


def filter_df(df):
    # Keep the rows whose eggNOG description contains the query string.
    # The query is matched as a literal substring (not as a regular
    # expression), and genes lacking a description are treated as having
    # an empty one.
    return df.loc[
        df['eggNOG_desc'].fillna('').str.contains(query_str, regex=False)
    ]


# Read in the table in chunks and filter as we go, so that only the matching
# rows of each chunk are retained
df = pd.concat([
    filter_df(chunk_df)
    for chunk_df in pd.read_hdf(
        results_hdf,
        '/annot/gene/all',
        iterator=True
    )
])
print("Number of genes containing the query '%s': %d" % (query_str, df.shape[0]))

# Without any matching genes there is nothing to write, and the declared
# output file would be missing. Stop here with a non-zero exit status and an
# explicit message instead of leaving a generic missing-output failure.
if df.shape[0] == 0:
    sys.exit("NO GENES FOUND MATCHING THE QUERY: %s" % query_str)

# Write out the smaller table
df.to_csv("${params.output_prefix}.genes.csv", index=None)

print("Done")

"""

}

// Copy the '/manifest' table out of the results HDF5 into
// <output_prefix>.manifest.csv
process extractManifest {

    container "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.5"
    label "mem_medium"
    publishDir "${params.output_folder}", mode: 'copy', overwrite: true

    input:
    file results_hdf

    output:
    file "${params.output_prefix}.manifest.csv"

"""#!/usr/bin/env python3

import os
import pandas as pd

# The results file must have been staged into the working folder
hdf_path = '${results_hdf}'
assert os.path.exists(hdf_path)

# Load the manifest table and save it as CSV, leaving out the index
pd.read_hdf(hdf_path, "/manifest").to_csv(
    "${params.output_prefix}.manifest.csv",
    index=None
)

"""

}

// Write the sequences of the genes listed in the annotation CSV to
// <output_prefix>.genes.fasta.gz, reading them from the gzipped gene FASTA
process extractFASTA {

    container "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.5"
    label "io_limited"
    publishDir "${params.output_folder}", mode: 'copy', overwrite: true

    input:
    file gene_csv
    file gene_fasta_gz

    output:
    file "${params.output_prefix}.genes.fasta.gz"

"""#!/usr/bin/env python3

from Bio.SeqIO.FastaIO import SimpleFastaParser
import gzip
import os
import pandas as pd

# Paths of the two inputs and of the output
gene_csv = '${gene_csv}'
gene_fasta_gz = '${gene_fasta_gz}'
output_fasta_gz = '${params.output_prefix}.genes.fasta.gz'

# Both inputs must be present in the working folder,
# and the output must not be there yet
for input_path in (gene_csv, gene_fasta_gz):
    assert os.path.exists(input_path)
assert not os.path.exists(output_fasta_gz)

# Load the annotation table
annot_df = pd.read_csv(gene_csv)
print("Read in annotations for %d genes" % annot_df.shape[0])

# The names of the genes whose sequences should be kept
wanted = set(annot_df['gene'].tolist())

# Number of sequences written so far
n_found = 0

# Stream the input FASTA, copying over only the records of interest
with gzip.open(gene_fasta_gz, 'rt') as handle_in:
    with gzip.open(output_fasta_gz, 'wt') as handle_out:

        for header, seq in SimpleFastaParser(handle_in):

            # Skip any record which is not in the annotation table
            if header not in wanted:
                continue

            handle_out.write(">%s\\n%s\\n" % (header, seq))
            n_found += 1

# Report the number of genes which were found
print("Wrote out %d gene sequences" % n_found)
print("DONE")

"""

}

// For every specimen in the manifest, read its gene abundance table from the
// details HDF5, keep the genes listed in the annotation CSV, and express their
// depth as a proportion of the total depth in that specimen. The results are
// written in long format (<output_prefix>.long.csv.gz) and as a specimen by
// eggNOG description table (<output_prefix>.wide.csv.gz).
process extractAbund {

    container "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.5"
    label "mem_medium"
    publishDir "${params.output_folder}", mode: 'copy', overwrite: true

    input:
    file gene_csv
    file details_hdf
    file manifest_csv

    output:
    file "${params.output_prefix}.*.csv.gz"

"""#!/usr/bin/env python3

import os
import pandas as pd

# Paths of the three inputs, all of which must be in the working folder
gene_csv = '${gene_csv}'
details_hdf = '${details_hdf}'
manifest_csv = '${manifest_csv}'
for input_path in (gene_csv, details_hdf, manifest_csv):
    assert os.path.exists(input_path)

# Load the gene annotations
annot_df = pd.read_csv(gene_csv)
print("Read in annotations for %d genes" % annot_df.shape[0])

# The names of the genes of interest
gene_names = set(annot_df['gene'].tolist())

# Load the manifest
manifest_df = pd.read_csv(manifest_csv)


def specimen_abund(store, specimen_name):
    # Return the abundances of the genes of interest in a single specimen,
    # labelled with the specimen name and with the share of the total depth

    print("Reading in abundances for specimen '%s'" % specimen_name)

    # The complete abundance table for this specimen
    specimen_df = pd.read_hdf(store, "/abund/gene/long/%s" % specimen_name)

    # Total depth across all of the genes, used as the denominator
    total_depth = specimen_df['depth'].sum()

    # Only the genes of interest are kept
    matched = specimen_df.loc[specimen_df['id'].isin(gene_names)]

    return matched.assign(
        specimen=specimen_name,
        prop=matched['depth'] / total_depth
    )


# Collect the long-format abundances, one table per specimen
with pd.HDFStore(details_hdf, 'r') as store:
    per_specimen = [
        specimen_abund(store, specimen_name)
        for specimen_name in manifest_df['specimen'].unique()
    ]

# Stack the per-specimen tables into one
print("Combining all outputs")
long_df = pd.concat(per_specimen).reset_index(drop=True)

# Look up the eggNOG description of every gene
print("Adding eggNOG names")
desc_by_gene = annot_df.set_index('gene')['eggNOG_desc']
long_df = long_df.assign(
    eggNOG_desc=long_df['id'].apply(desc_by_gene.get)
)

# Save the long output
print("Saving long output")
long_df.to_csv(
    "${params.output_prefix}.long.csv.gz",
    index=None,
    compression='gzip'
)

# Sum the proportions by description within each specimen; combinations
# which were not observed are filled in with zero
wide_df = long_df.pivot_table(
    index="specimen",
    columns="eggNOG_desc",
    values="prop",
    aggfunc=sum
).fillna(0).reset_index()

# Save the wide output
wide_df.to_csv(
    "${params.output_prefix}.wide.csv.gz",
    index=None,
    compression='gzip'
)

print("DONE")

"""

}