From 47c75d8d6737d857f8cf27756e5c11f0c45236c3 Mon Sep 17 00:00:00 2001
From: Kevin Bonham
Date: Fri, 27 May 2016 12:35:31 -0400
Subject: [PATCH] Clean-up (#14)

* Clean-up

* fixed tab in output.py

* removed old code/ folder

* more cleanup
---
 .gitignore                    |    4 -
 README.md                     |   43 ++--
 code/DataImport.py            |  130 ----------
 code/DeprecatedOutputs.py     |  268 --------------------
 code/DeprecatedPhylo.py       |  453 ----------------------------------
 code/Deprecatedget_outputs.py |   23 --
 code/FixGbk.py                |   78 ------
 code/KvCircos.py              |  177 -------------
 code/KvDataStructures.py      |  168 -------------
 code/KvasirBlast.py           |  229 -----------------
 code/KvasirHGT.py             |  201 ---------------
 code/KvasirPhylo.py           |  148 -----------
 code/reset_db.py              |   10 -
 code/run_kvasir.py            |   56 -----
 requirements.txt              |    2 +-
 run.py                        |   37 +--
 settings.py                   |    2 +-
 src/Analysis/output.py        |   10 +-
 test.txt                      |    0
 19 files changed, 44 insertions(+), 1995 deletions(-)
 delete mode 100644 code/DataImport.py
 delete mode 100644 code/DeprecatedOutputs.py
 delete mode 100644 code/DeprecatedPhylo.py
 delete mode 100644 code/Deprecatedget_outputs.py
 delete mode 100644 code/FixGbk.py
 delete mode 100644 code/KvCircos.py
 delete mode 100644 code/KvDataStructures.py
 delete mode 100644 code/KvasirBlast.py
 delete mode 100644 code/KvasirHGT.py
 delete mode 100644 code/KvasirPhylo.py
 delete mode 100644 code/reset_db.py
 delete mode 100644 code/run_kvasir.py
 delete mode 100644 test.txt

diff --git a/.gitignore b/.gitignore
index faeeb03..5f82b27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,3 @@
 .DS_store
 *ipynb*
-*.sublime-project
-*.sublime-workspace
 *.pyc
-.idea
-user_settings.py
\ No newline at end of file

diff --git a/README.md b/README.md
index df00e81..14dc4e2 100644
--- a/README.md
+++ b/README.md
@@ -7,36 +7,29 @@ Dependencies:
 * MongoDB
 * pymongo
 * BioPython
-* ~~Scikit Bio~~
-* ~~Pandas~~
+* BLAST+ CLI
 
 **Identification of horizontal gene transfer between sequenced microbial genomes**
 
-##The following is out of date... Will get back to this soon
+Kvasir takes as input a folder of annotated genomes in GenBank format. The protein-coding genes from these genomes are loaded into a database and BLASTed against each other.
 
-##~~Running Kvasir~~
-~~With dependencies installed, fire up a Mongod instance. In the terminal:~~
+### Usage
 
-~~`mongod --dbpath path/to/db`~~
+Change the values in `settings.py` to set the path to your input folder, the path for output, and the name you want for your database.
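+
+For illustration only, `settings.py` might look something like the sketch below; the
+variable names here are hypothetical, so check the file itself for the real ones:
+
+```
+# hypothetical settings.py values (names are illustrative, not the module's real API)
+INPUT_FOLDER = "path/to/genbank_files"  # folder of .gb genomes to import
+OUTPUT_FOLDER = "path/to/output"        # where results get written
+DB_NAME = "kvasir"                      # name for the MongoDB database
+```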
 
-~~Run Kvasir by invoking run_kvasir.py in your working directory:~~
+Launch a local `mongod` instance:
+```
+$ mongod --dbpath path/to/db
+```
 
-~~`python run_kvasir.py /path/to/gb_files name_of_mongoDB`~~
+Run the functions in `run.py`. Eventually this will be streamlined; for now, call them from an interactive session:
 
-####~~DataImport~~:
-* ~~Imports genbank-formated annotated genomes into Mongo database.~~
-* ~~.gb files require "locus_tag" feature. If your genomes don't have it, FixGbk.py shoul take care of it for you~~
-* ~~Mongo database has "collections" and "documents" - a different collection is generated for each species (each separate genbank file) and documents representing each CDS.~~
-  * ~~CDS documents are like python dictionaries, and contain entries for species, DNA and amino acid protein sequences, contig and location info, and annotation information.~~
-  * ~~each document is assigned a uniqe `_id` attribute within the species, so every gene is uniquely identified by a `(species, _id)` tuple~~
-
-####~~MakeBlastDb~~
-~~Generates a multi-fasta file containing every gene in the mongo database, generates a BLASTable database, then deletes the temprorary file~~
-
-####~~KsavirBlast~~
-* ~~For each species, generates a temporary fasta file and BLASTs against every other gene in the database~~
-* ~~BLAST generates and xml document, which is parsed for unique hits~~
-* ~~new "hits" entry is added to each gene document in MongoDB, which contains a list of `(species, _id)` tuples for each hit (these are used in the next script to gather info about hits)~~
-
-####~~Outputs~~
-~~Still a work in progress. So far, have a bunch of output formates working... will detail later.~~
+```
+Python 2.7.11 (default, Dec 14 2015, 10:44:13)
+[GCC 4.2.1 Compatible Apple LLVM 7.0.2 (clang-700.1.81)] on darwin
+Type "help", "copyright", "credits" or "license" for more information.
+>>> import run
+>>> run.import_data()
+>>> run.blast_db()
+>>> run.analyze(0.99)
+```
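
(In the session above, the argument to `run.analyze` appears to be a sequence-identity
cutoff; the removed code below calls BLAST with `-perc_identity` values of 90, 95, and
99, so `0.99` keeps only hits with at least 99% identity.)

For reference, each protein-coding gene is stored as a single MongoDB document. The
sketch below is read directly off the `gene_record` dict built in the deleted
`DataImport.py` that follows; the field names come from that code, while every value
is an invented placeholder:

```
# Shape of one 'gene' document, per DataImport.import_file (values are placeholders)
gene_record = {
    'species': 'Arthrobacter_arilaitensis_Re117',
    'location': {
        'contig': 'kvc_001',  # contig name
        'start': 1030,        # coordinates from the GenBank feature
        'end': 1460,
        'strand': -1,         # 1 = forward strand, -1 = reverse
        'index': None,
    },
    'locus_tag': 'AAR2_00001',
    'annotation': 'hypothetical protein',
    'dna_seq': 'ATG...TAA',   # nucleotide sequence of the CDS
    'aa_seq': 'MKT...',       # translation from the GenBank record
    'type': 'gene',           # contigs use 'contig', 16S genes use '16S'
}
```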
diff --git a/code/DataImport.py b/code/DataImport.py
deleted file mode 100644
index 6356ba9..0000000
--- a/code/DataImport.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-# by Kevin Bonham, PhD (2015)
-# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA
-# Unless otherwise indicated, licensed under GNU Public License (GPLv3)
-
-'''
-This script is designed to import data from a genbank file and
-write it to a mongoDB database. Must have Mongod running, in terminal:
-`mongod --dbpath path/to/db`.
-'''
-
-import re, os
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC
-import KvDataStructures as kv
-
-def import_file(some_genbank, collection):
-    """
-    Import records from `some_genbank` into Mongo `collection`
-    * Imports each coding sequence (CDS) as document of {'type':'gene'}
-    * Imports up to one 16S rRNA sequences as document of {'type':'16S'}
-    * Each document has info on species, contig and location, DNA sequence and (for CDS) amino acid sequence
-    * Each gene in genbank file MUST have `locus_tag` feature.
-    If it doesn't, use `add_locus_tags()`
-    * Note - `add_locus_tags()` doesn't exist yet, will be similar to `FixGbk.validate_gbk()`
-    """
-    with open(some_genbank, 'r') as open_file:
-        collection = kv.get_collection(collection)
-
-        # Each "record" in genbank file is read, corresponds to individual contigs
-        for record in SeqIO.parse(open_file, 'gb'):
-            current_contig = record.name
-            try:
-                current_species = record.annotations['source']
-            except KeyError:
-                name = re.search(r'\w+\/(.+)\.\w+$', some_genbank)
-                current_species = name.group(1)
-
-            collection.insert_one({
-                'species':current_species,
-                'contig':current_contig,
-                'dna_seq':str(record.seq),
-                'type':'contig'
-            })
-
-            print "Importing {}".format(current_contig)
-            ssu_gene = get_16S(record)
-            if ssu_gene:
-                try:
-                    locus_tag = ssu_gene[0].qualifiers['locus_tag'][0]
-                except KeyError:
-                    locus_tag = None
-
-                parsed_location = kv.get_gene_location(ssu_gene[0].location)
-                gene_record = {
-                    'species':current_species,
-                    'location':{
-                        'contig':current_contig,
-                        'start':parsed_location[0],
-                        'end':parsed_location[1],
-                        'strand':parsed_location[2],
-                    },
-                    'locus_tag':locus_tag,
-                    'annotation':ssu_gene[0].qualifiers['product'][0],
-                    'dna_seq':ssu_gene[1],
-                    'type':'16S'
-                }
-                print "adding 16S gene!"
-                collection.insert_one(gene_record)
-                kv.get_collection('16S').insert_one(gene_record)
-
-            for feature in record.features:
-                if feature.type == 'CDS':
-                    parsed_location = kv.get_gene_location(feature.location)
-                    try:
-                        locus_tag = feature.qualifiers['locus_tag'][0]
-                    except KeyError:
-                        locus_tag = None
-
-                    gene_record = {
-                        'species':current_species,
-                        'location':{
-                            'contig':current_contig,
-                            'start':parsed_location[0],
-                            'end':parsed_location[1],
-                            'strand':parsed_location[2],
-                            'index':None
-                        },
-                        'locus_tag':locus_tag,
-                        'annotation':feature.qualifiers['product'][0],
-                        'dna_seq':get_dna_seq(feature, record),
-                        'aa_seq':feature.qualifiers['translation'][0],
-                        'type':'gene'
-                    }
-                    collection.insert_one(gene_record)
-
-def get_dna_seq(feature, record):
-    if feature.location.strand == 1:
-        dna_seq = record.seq[
-            feature.location.start:feature.location.end
-        ]
-    elif feature.location.strand == -1:
-        dna_seq = record.seq[
-            feature.location.start:feature.location.end
-        ].reverse_complement()
-    return str(dna_seq)
-
-def get_16S(gbk_record):
-    for feature in gbk_record.features:
-        if feature.type == 'rRNA':
-            if re.search(r"16[sS]|ssu|SSU", str(feature.qualifiers['product'])):
-                if len(feature) > 1000:
-                    return (feature, get_dna_seq(feature, gbk_record))
-                else:
-                    return None
-
-# Need to fix search! Only returns "contig"...
-def get_contig(record_name):
-    import re
-    parse_contig = re.search(r'kvc_(\d\d\d)', record_name)
-    return parse_contig.group(1)
-
-def import_folder(genbank_folder):
-    import os
-    print 'we\'re importing a folder now!'
- for a_file in os.listdir(genbank_folder): - current_file = os.path.join(genbank_folder, a_file) - print 'importing {0}!'.format(current_file) - import_file(current_file) diff --git a/code/DeprecatedOutputs.py b/code/DeprecatedOutputs.py deleted file mode 100644 index 7966101..0000000 --- a/code/DeprecatedOutputs.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python 2.7 -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os -from itertools import combinations, combinations_with_replacement -from bson.objectid import ObjectId -import pandas as pd -import KvDataStructures as kv -from skbio import DNA -from skbio import DistanceMatrix -from skbio import tree -from skbio.alignment import StripedSmithWaterman -from matplotlib import pyplot as plt -import numpy as np - - -def output_hits_csv(): - hits = kv.get_collection('hits') - if not os.path.isdir('hits/'): - os.makedirs('hits/') - - df_index = [ - 'parent_locus', - 'parent_annotation', - 'parent_seq', - 'parent_contig', - 'parent_start', - 'parent_end', - 'parent_strand', - 'hit_tag', - 'hit_annotation', - 'hit_seq', - 'hit_contig', - 'hit_start', - 'hit_end', - 'hit_strand', - ] - - for record in hits.find(): - query_species = record['species'] - df = pd.DataFrame() - - for query_id in record['hits']: - list_of_hits = record['hits'][query_id] - if list_of_hits: - query_record = kv.get_collection('core').find_one({'species':query_species, '_id':ObjectId(query_id)}) - - for hit in list_of_hits: - - hit_species = hit[0] - hit_id = hit[1] - hit_record = kv.get_collection('core').find_one({'species':hit_species, '_id':ObjectId(hit_id)}) - hit_record['kvtag'] - query_annotation = query_record['annotation'].replace(',','') - hit_annotation = hit_record['annotation'].replace(',','') - - series = pd.Series( - [query_record['kvtag'], - query_annotation, - query_record['dna_seq'], - query_record['location']['contig'], - query_record['location']['start'], - query_record['location']['end'], - query_record['location']['strand'], - hit_record['kvtag'], - hit_annotation, - hit_record['dna_seq'], - hit_record['location']['contig'], - hit_record['location']['start'], - hit_record['location']['end'], - hit_record['location']['strand'], - ], - index=df_index, - name=hit_record['species'] - ) - - df=df.append(series) - df.to_csv('hits/{}_hits.csv'.format(query_species), columns=df_index) - -def output_one_fasta(mongo_record, out_file='output.fna'): - with open(out_file, 'w+') as output_handle: - output_handle.write( - '>{0}|{1}\n{2}\n'.format( - mongo_record['species'], - str(mongo_record['_id']), - mongo_record['dna_seq'], - ) - ) - -def get_groups(): - all_hits = kv.get_collection('hits') - groups_list = [] - for h in all_hits.find(): - current_species = h['species'] - - # if any([x in current_species for x in dutton_list] or [y in current_species for y in wolfe_list]): - current_species_islands = get_islands(h['species']) - - # each sublist represents one island... - for island in current_species_islands: - hit_set = set() # container for hits - for gene_id in island: - gene_hits = h['hits'][gene_id[1]] - - # Pulls each hit id tuple, then appends it to group_set - for hit in gene_hits: - hit_set.add((hit[0], hit[1])) - # add id tuples for hits to island list... 
- island.update(hit_set) - # And add new island (with multiple species) to groups_list - groups_list.append(list(island)) - - # Since each species' islands are built independently, there's a lot of redundancy - # So... Collapse lists that contain shared elements and deduplicate - return map(list, collapse_lists(groups_list)) - -def output_groups(output_file='default', min_group_size=2): - if output_file == 'default': - output_file = 'groups.csv'.format(kv.db.name) - df_index = [ - 'groups', - 'species', - 'kvtag', - 'contig', - 'start', - 'end', - 'strand', - 'annotation', - 'dna_seq', - ] - df = pd.DataFrame() - group_no= 0 - groups_list = get_groups() - groups_list.sort(key=len, reverse=True) - - for group in groups_list: - if len(group) >= min_group_size: - group_no += 1 - # Entry is `(species, id)` - for entry in group: - db_handle = kv.get_mongo_record(*entry) - - annotation = db_handle['annotation'].replace(',','') - series = pd.Series( - [str(group_no).zfill(3), - db_handle['species'], - db_handle['kvtag'], - db_handle['location']['contig'], - db_handle['location']['start'], - db_handle['location']['end'], - db_handle['location']['strand'], - annotation, - db_handle['dna_seq'] - ], - index=df_index, - name=db_handle['kvtag'] - ) - df=df.append(series) - df.to_csv(output_file, columns=df_index) - -def output_groups_by_species(min_group_size=2): - all_species = kv.get_species_collections() - groups_list = get_groups() - groups_list.sort(key=len, reverse=True) - - groups_df = pd.DataFrame(data={n:0 for n in all_species}, index=[str(x+1) for x in range(0, len(groups_list))]) - - group_no = 0 - for group in groups_list: - if len(group) >= min_group_size: - group_no += 1 - species_in_group = [x[0] for x in group] - for species in species_in_group: - groups_df[species][group_no-1] = 1 - groups_df.to_csv('groups_by_species.csv') - -def output_compare_matrix(): - groups = get_groups() - all_species = get_tree()[1] - print all_species - - cds_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) - nt_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) - groups_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) - - for pair in combinations(all_species, 2): - print "====\nComparing {} and {}".format(pair[0], pair[1]) - shared_cds, shared_nt = pair_compare(pair[0],pair[1]) - shared_groups = 0 - if kv.get_genus(pair[0]) == kv.get_genus(pair[1]): - print "Oops! They're the same genus... moving on\n====" - continue - elif shared_cds == 0: - print "Oops! Looks like they don't share anything... 
moving on\n====" - continue - else: - for group in get_groups(): - if [x for x in group if x[0] == pair[0] and any(y[0] == pair[1] for y in group)]: - shared_groups +=1 - - cds_df[pair[0]][pair[1]] = cds_df[pair[1]][pair[0]] = shared_cds - nt_df[pair[0]][pair[1]] = nt_df[pair[1]][pair[0]] = shared_nt - groups_df[pair[0]][pair[1]] = groups_df[pair[1]][pair[0]] = shared_groups - print "shared cds: {}\nshared nt: {}\nshared groups: {}\n====".format(shared_cds, shared_nt, shared_groups) - - print nt_df - - cds_df.to_csv('cds_matrix.csv'.format(kv.db.name)) - nt_df.to_csv('nt_matrix.csv'.format(kv.db.name)) - groups_df.to_csv('groups_matrix.csv' .format(kv.db.name)) - -def pair_compare(species_1, species_2): - shared_CDS = 0 - shared_nt = 0 - - s1_genes = kv.get_collection('hits').find_one({'species':species_1}) - - for gene in s1_genes['hits']: - if s1_genes['hits'][gene]: - for hit in s1_genes['hits'][gene]: - if hit[0] == species_2: - shared_CDS += 1 - species_2_record = kv.get_mongo_record(hit[0],hit[1]) - hit_loc = species_2_record['location'] - shared_nt += hit_loc['end'] - hit_loc['start'] - return shared_CDS, shared_nt - -def get_islands(species_name): - islands = [] - species_hits_list = [] - - # Add mongo_record for each hit in any gene - all_hits = kv.get_collection('hits') - - - for query_id in species_hits: - if species_hits[query_id]: - species_hits_list.append( - kv.get_mongo_record(species_name, query_id) - ) - - for entry_1 in species_hits_list: - entry_recorded = False - for entry_2 in species_hits_list: - if entry_1 == entry_2: - pass - elif entry_1['location']['contig'] != entry_2['location']['contig']: - pass - else: - location_1 = entry_1['location'] - location_2 = entry_2['location'] - if abs(location_1['end'] - location_2['start']) <= 5000: - entry_recorded = True - islands.append([ - (entry_1['species'], str(entry_1['_id'])), - (entry_2['species'], str(entry_2['_id'])) - ]) - if not entry_recorded: - islands.append([(entry_1['species'], str(entry_1['_id']))]) - - return collapse_lists(islands) - -if __name__ == '__main__': - kv.mongo_init('reorg') - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/reorg/') - get_tree(True) \ No newline at end of file diff --git a/code/DeprecatedPhylo.py b/code/DeprecatedPhylo.py deleted file mode 100644 index da568e7..0000000 --- a/code/DeprecatedPhylo.py +++ /dev/null @@ -1,453 +0,0 @@ -# !/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import re -import os -import KvDataStructures as kv -import KvasirHGT as kh -from itertools import combinations, permutations -from matplotlib import pyplot as plt -from Bio import SeqIO -from Bio.Blast import NCBIXML -from bson.objectid import ObjectId -from subprocess import Popen, PIPE -import numpy as np -import pymongo -import plotly.plotly as py - -def get_16S_distance(species_1, species_2, pident=False): - # print 'Aligning:', species_1, species_2 - s1_ssu = str(kv.db['16S'].find_one({'species':species_1})['dna_seq']) - s2_ssu = str(kv.db['16S'].find_one({'species':species_2})['dna_seq']) - return get_gene_distance(s1_ssu, s2_ssu) - -def output_all_16S(): - print "Making fasta of all 16S in database {}".format(kv.db.name) - with open('{}_16S.fna'.format(kv.db.name), 'w+') as output_handle: - for record in kv.get_collection('16S').find(): - output_handle.write( - '>{0}\n{1}\n'.format( - record['species'], - record['dna_seq'], - ) - ) - -def 
get_gene_distance(seq_1, seq_2): - query = StripedSmithWaterman(seq_1) - alignment = query(seq_2) - q = DNA(alignment.aligned_query_sequence) - t = DNA(alignment.aligned_target_sequence) - return q.distance(t) - -def output_distance_matrix(core=False, to_file=True): - all_species = kv.get_collection('core').distinct('species')) - if core: - pass - else: - all_species.extend(kv.get_collection('other').distinct('species')) - - distance_matrix = pd.DataFrame(data={n:0.0 for n in all_species}, index=all_species) - - for pair in combinations_with_replacement(all_species, 2): - distance = get_16S_distance(pair[0], pair[1]) - distance_matrix[pair[0]][pair[1]] = distance - distance_matrix[pair[1]][pair[0]] = distance - - if to_file: - distance_matrix.to_csv('distance_matrix.csv') - else: - return distance_matrix - -def get_tree(core=False, newick=False): - all_species = kv.get_collection('core').distinct('species') - if core: - pass - else: - all_species.extend(kv.get_collection('other').distinct('species')) - t = tree.nj(dm) - print t.ascii_art() - tips = [] - for node in t.tips(): - print node.name, node.length - tips.append(node.name.replace(' ', '_')) - if newick: - n = tree.nj(dm, result_constructor=str) - print n - else: - return (t, tips) - -def ssu_perc_id(species_1, species_2): - s1_ssu = kv.db['16S'].find_one({'species':species_1}) - s2_ssu = kv.db['16S'].find_one({'species':species_2}) - - s1_fasta = make_gene_fasta(s1_ssu) - s2_fasta = make_gene_fasta(s2_ssu) - - out = Popen( - ['blastn', - '-query', s1_fasta, - '-subject', s2_fasta, - '-outfmt', '10 pident', - ], stdout=PIPE - ).communicate()[0] - if out: - result = re.split(r'[,\n]', out.rstrip())[0] - return result - -def make_species_fasta(species): - if not os.path.isdir('fastas'): - os.makedirs('fastas') - fasta = 'fastas/{}.fna'.format(species) - if not os.path.isfile(fasta): - with open(fasta, 'w+') as output_handle: - for record in kv.get_collection(species).find(): - output_handle.write( - ">{}|{}\n{}\n".format(record['species'].replace(' ', '_'), record['_id'], record['dna_seq']) - ) - return fasta - -def make_gene_fasta(gene_record): - if not os.path.isdir('fastas/genes'): - os.makedirs('fastas/genes') - fasta = 'fastas/genes/{}_{}.fna'.format(gene_record['species'], gene_record['kvtag']) - if not os.path.isfile(fasta): - with open(fasta, 'w+') as output_handle: - output_handle.write( - ">{}|{}\n{}\n".format(gene_record['species'].replace(' ', '_'), gene_record['_id'], gene_record['dna_seq']) - ) - return fasta - -def make_indexed_fasta(species): - if not os.path.isdir('fastas/'): - os.makedirs('fastas/') - id_list =[] - indexed_species = kv.index_contigs(kv.get_collection(species)) - indexed_species2 = kv.index_contigs(kv.get_collection(species)) - fasta = 'fastas/{}_indexed.fna'.format(species) - for record in indexed_species: - id_list.append(record['_id']) - if not os.path.isfile(fasta): - with open(fasta, 'w+') as output_handle: - for record in indexed_species2: - output_handle.write( - ">{}|{}\n{}\n".format(record['species'].replace(' ', '_'), record['_id'], record['dna_seq']) - ) - - return (fasta, id_list) - -def fasta_blast(species_1, species_2, word_size=28): - if not os.path.isdir('pairwise_blast'): - os.makedirs('pairwise_blast') - - s1 = make_species_fasta(species_1) - s2 = make_species_fasta(species_2) - results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2) - - if not os.path.isfile('pairwise_blast/{}_blastdb.nhr'.format(species_2)): - Popen( - ['makeblastdb', - '-in', s2, - 
'-dbtype', 'nucl', - '-out', 'pairwise_blast/{}_blastdb'.format(species_2), - '-title', os.path.basename(species_2), - ] - ).wait() - - if word_size > 7: - out = Popen( - ['blastn', - '-query', s1, - '-db', 'pairwise_blast/{}_blastdb'.format(species_2), - '-outfmt', '5', - '-word_size', str(word_size), - '-out', results, - ], - ).communicate()[0] - else: - print "Word size is too small, skipping" - return False - - blast_hits = count_records(results) - print "Got {} hits".format(blast_hits) - if blast_hits < 800: - if word_size > 7: - print "Not enough hits, trying with word size = {}".format(word_size-2) - os.remove(results) - return fasta_blast(species_1, species_2, word_size-2) - else: - print "Still not enough hits, but we're giving up trying to get to 800" - return results - elif blast_hits >= 800: - print "Got enough hits!" - return results - else: - print "something went wrong" - os.remove(results) - return False - -def output_loc_hist(species_1, species_2, ax): - s1, id_list = make_indexed_fasta(species_1) - s2 = make_species_fasta(species_2) - if not os.path.isfile('pairwise_blast/{}_blastdb.nhr'.format(species_2)): - Popen( - ['makeblastdb', - '-in', s2, - '-dbtype', 'nucl', - '-out', 'pairwise_blast/{}_blastdb'.format(species_2), - '-title', os.path.basename(species_2), - ] - ).wait() - - indexed_blast = blast_one(s1, 'pairwise_blast/{}_blastdb'.format(species_2)) - concatenated_subject = kv.concat_contigs(kv.get_collection(species_1)) - - xys = [] - last_end = 0 - - for i in range(len(indexed_blast))[0::4]: - # print indexed_blast[i:i+4] - - subject = concatenated_subject[ObjectId(kv.fasta_id_parse(indexed_blast[i])[1])] - query = kv.get_mongo_record(*kv.fasta_id_parse(indexed_blast[i+1])) - - x1 = subject['location']['start'] - if x1 <= last_end: - x1 = last_end + 1 - - x2 = subject['location']['end'] - last_end = x2 - - y = float(indexed_blast[i+2]) - print [(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)] - xys.extend([(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)]) - - xys.sort(key=lambda xy:xy[0]) - x, y = zip(*xys) - - if len(x) > 200: - cmap = plt.get_cmap('jet') - color = cmap(np.random.rand()) - - ax.plot(x, y, marker=None, color=color, linestyle='-', label='{}'.format(species_2)) - print "Plotting {} vs {}!".format(species_1, species_2) - - ssu_id = ssu_perc_id(species_1, species_2) - xmin, xmax = plt.xlim() - ax.plot([xmin, xmax], [ssu_id, ssu_id], color=color, label='{} 16S'.format(species_2)) - - else: - print "Not that many hits between {} and {}, skipping!".format(species_1, species_2) - - # plt.fill_between(x, 0, y, color='#B0384D', alpha=0.5) - -def blast_one(query, database, word_size=28): - out, err = Popen( - ['blastn', - '-query', query, - '-db', database, - '-num_alignments', '1', - '-outfmt', '10 qseqid sseqid pident length', - '-word_size', str(word_size), - ], stdout=PIPE - ).communicate() - if out: - result = re.split(r'[,\n]', out.rstrip()) - return result - else: - return None - -def all_by_all(species_1, species_2): - # results = fasta_blast(species_1, species_2) - results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2) - if results: - with open(results, 'r') as result_handle: - blast_records = NCBIXML.parse(result_handle) - hits_list = [] - for blast_record in blast_records: - qsp, qid = kv.fasta_id_parse(blast_record.query) - query_record = kv.get_mongo_record(qsp, qid) - for alignment in blast_record.alignments: - asp, aid = kv.fasta_id_parse(alignment.hit_def) - alignment_record = kv.get_mongo_record(asp, aid) - for hsp in 
alignment.hsps: - if hsp.align_length > 100: - pident = float(hsp.positives)/float(hsp.align_length) - length = hsp.align_length - hits_list.append((query_record, alignment_record)) - break - break - return hits_list - else: - print "Blast didn't work for some reason" - -def global_distance(species_1, species_2): - hits_list = all_by_all(species_1, species_2) - hits_distance = [] - ssu_distance = None - for hit in hits_list: - distance = kh.get_gene_distance(str(hit[0]['dna_seq']), str(hit[1]['dna_seq'])) - hits_distance.append(distance) - if hit[0]['type'] == '16S': - ssu_distance = distance - - return (ssu_distance, hits_distance) - -def plot_global_distance(list_of_species, sp_index): - focus = list_of_species[sp_index] - list_copy = list(list_of_species) - list_copy.remove(focus) - - ssus = [] - median_ys = [] - avg_ys = [] - plt.close() - - for sp in list_copy: - x_points = [] - y_points = [] - print sp - data = global_distance(focus, sp) - print data[0] - if data[0]: - for i in range(len(data[1])): - x_points.append(data[0]) - y_points.append(data[1][i]) - plt.scatter(x_points, y_points, marker='o', color='#B0384D') - ssus.append(data[0]) - median_ys.append(np.median(data[1])) - avg_ys.append(np.average(data[1])) - - plt.title(focus) - plt.plot([x for x in ssus], [y for y in ssus], marker='s', color='#599FA7') - plt.plot([x for x in ssus], [y for y in median_ys], '^m') - plt.xlabel("Species Distance") - plt.ylabel("Gene Distance") - plt.xlim(xmax=1.5, xmin=0) - plt.ylim(ymin=0) - - plt.savefig('{}_distances.pdf'.format(focus)) - -def pairwise_distance(species_1, species_2): - print "working on {} vs {}".format(species_1, species_2) - hits = all_by_all(species_1, species_2) - xy_points = [] - for hit in hits: - x = len(hit[0]['dna_seq']) - y = khget_gene_distance(str(hit[0]['dna_seq']), str(hit[1]['dna_seq'])) - xy_points.append((x,y)) - - - - x, y = zip(*xy_points) - y_average = np.average(y) - y_std = np.std(y) - plt.close() - plt.title("{} vs {}".format(species_1, species_2)) - plt.scatter(x, y, marker='o', color='#B0384D') - plt.axhline(y=y_average, color='#599FA7') - plt.axhline(y=y_average+y_std, color='#599FA7', ls=':') - plt.axhline(y=y_average-y_std, color='#599FA7', ls=':') - - plt.xlabel("Gene Length (bp)") - plt.ylabel("Gene Distance") - plt.xlim(xmin=0) - plt.ylim(ymin=0) - - plt.savefig('{}_{}_genedistance.pdf'.format(species_1, species_2)) - -def global_hist(list_of_species, sp_index): - focus = list_of_species[sp_index] - list_copy = list(list_of_species) - list_copy.remove(focus) - - plt.close() - - for sp in list_copy: - print sp - x_points = [] - data = global_distance(focus, sp) - print data[0] - if data[0] < 1.0: - data_list = [] - for i in range(len(data[1])): - data_list.append(data[1][i]) - x_points.append(data_list) - - plt.hist(x_points, histtype='bar', stacked=True) - - plt.title(focus) - plt.xlabel("Species Distance") - plt.ylabel("Gene Distance") - ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), - ncol=4, fancybox=True) - - plt.savefig('{}_hist.pdf'.format(focus)) - -def pairwise_hist(species_1, species_2): - print "working on {} vs {}".format(species_1, species_2) - hits = all_by_all(species_1, species_2) - xs = [] - for hit in hits: - x = khget_gene_distance(str(hit[0]['dna_seq']), str(hit[1]['dna_seq'])) - xs.append(x) - - plt.close() - plt.title("{} vs {}".format(species_1, species_2)) - plt.hist(xs, color='#B0384D',) - - plt.xlabel("Gene Distance") - plt.ylabel("frequency") - plt.xlim(xmin=0) - plt.ylim(ymin=0) - - 
plt.savefig('{}_{}_distance_hist.pdf'.format(species_1, species_2)) - -def count_records(blast_xml): - with open(blast_xml, 'r') as result_handle: - blast_records = NCBIXML.parse(result_handle) - counter = 0 - for i in blast_records: - try: - i.alignments[0] - counter +=1 - except IndexError: - pass - return counter - -def plot_many(list_of_species_pairs): - fig = plt.figure() - ypos = 1 - for pair in list_of_species_pairs: - if pair[0] == pairs[0][0]: - ax = fig.add_axes([1,ypos,1,1]) - output_loc_hist(pair[0], pair[1], ax) - - plot_url = py.plot_mpl(fig) - print plot_url - - # plt.xlabel("Position") - # plt.ylabel("percent identity") - # plt.savefig('/Users/KBLaptop/Desktop/try.pdf') - -if __name__ == '__main__': - kv.mongo_init('more_genomes') - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/more_genomes/') - ls = kv.get_species_collections() - print ls - ls.remove('Arthrobacter_arilaitensis_Re117') - pairs = [] - for pair in combinations(ls, 2): - pairs.append((pair[0], pair[1])) - plot_many(pairs) - - - # if os.path.isfile('{}_{}.pdf'.format(pair[0], pair[1])): - # continue - # try: - # output_loc_hist(pair[0], pair[1]) - # except RuntimeError: - # print "Couldn't compare {} and {}".format(pair[0], pair[1]) - - # test() \ No newline at end of file diff --git a/code/Deprecatedget_outputs.py b/code/Deprecatedget_outputs.py deleted file mode 100644 index 3e52d34..0000000 --- a/code/Deprecatedget_outputs.py +++ /dev/null @@ -1,23 +0,0 @@ - #!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os -import sys -import KvasirHGT as kh -from KvDataStructures import mongo_init - -def get_outputs(): - if not os.path.isdir('hgt_data/'): - os.makedirs('hgt_data/') - os.chdir('hgt_data/') - - kh.output_hits_csv() - kh.output_groups() - -# if __name__ == '__main__': -# import sys -# os.chdir('/Users/KBLaptop/computation/kvasir/data/output/dutton_genomes2/') -# mongo_init('dutton_genomes2') -# get_outputs() \ No newline at end of file diff --git a/code/FixGbk.py b/code/FixGbk.py deleted file mode 100644 index c5d44fc..0000000 --- a/code/FixGbk.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os, re -from Bio import SeqIO - -def confirm_species_name(file_name): - user_validate = str(raw_input("Is {} the correct species name? [y/n]".format(file_name[:-3]))) - if user_validate == 'y': - return file_name[:-3].replace(' ', '_').replace('.', '') - elif user_validate == 'n': - print "Please use only letters for the following" - genus = raw_input("Please enter organism genus: ") - species = raw_input("Please enter organism species (use \"sp\" if unknown):") - print "Please use only letters, numbers and '_' for the following" - strain = raw_input("Please enter organism strain (use \"undef\" if unknown):") - if re.match(r'^[A-Za-z]+$', genus) and re.match(r'^[A-Za-z]+$', species) and re.match(r'^\w+$', strain): - return '{}_{}_{}'.format(genus, species, strain) - else: - "Something's wrong, try again" - confirm_species_name(file_name) - else: - print "Please enter \"y\" or \"n\"... 
try again" - confirm_species_name(file_name) - -def validate_gbk(some_genbank, user_validate=True): - file_name = os.path.basename(some_genbank) - sp_name = None - if user_validate: - sp_name = confirm_species_name(file_name) - else: - sp_name = file_name[:-3].replace(' ', '_').replace('.', '') - - if sp_name: - new_file_name = 'validated_gbk/{0}_validated.gb'.format(sp_name) - - if not os.path.isfile(new_file_name): - with open(some_genbank, 'r') as open_file: - - lt_counter = 0 - contig_counter = 0 - genome = SeqIO.parse(open_file, 'gb') - new_genome = [] - - for record in genome: - new_genome.append(record) - new_genome.sort(key=len, reverse=True) - - for record in new_genome: - contig_counter += 1 - try: - source = record.annotations['source'] - except KeyError: - record.annotations['source'] = 'None' - record.annotations['comment'] = """Validated gb file for use with Kvasir HGT finder www.github.com/kescobo/kvasir - Original Def: {} - Original source: {} - """.format(record.name, record.annotations['source']) - record.name = 'kvc_{0}'.format(str(contig_counter).zfill(3)) - record.annotations['source'] = sp_name - for feature in record.features: - lt_counter += 1 - feature.qualifiers['kvtag'] = str(lt_counter).zfill(5) - - if not os.path.isdir('validated_gbk/'): - os.makedirs('validated_gbk/') - with open(new_file_name, 'w+') as output_handle: - SeqIO.write(new_genome, output_handle, 'gb') - return os.path.abspath(new_file_name) - else: - print 'already fixed, skipping validation!' - return False - -if __name__ == '__main__': - os.chdir('/Users/KBLaptop/computation/tmp') - validate_gbk('/Users/KBLaptop/computation/genomes/arthro_IMG.gb', False) \ No newline at end of file diff --git a/code/KvCircos.py b/code/KvCircos.py deleted file mode 100644 index dd1e021..0000000 --- a/code/KvCircos.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os -import re -from Bio import SeqIO, SeqUtils -from KvasirHGT import core_hgt_groups -import numpy as np -import KvDataStructures as kv -import pandas as pd - -def get_karyotype(some_gbk): - """ - Convert Genbank file into Karyotype file for Circos - - Each contig is a "chromosome" - - format: 'chr - ID LABEL START END COLOR' - """ - with open (some_gbk, 'r') as file_handle: - contigs = [] - sp_name = None - for record in SeqIO.parse(file_handle, 'gb'): - sp_name = kv.parse_genbank_name(some_gbk) - contigs.append((record.name, len(record))) - sp_strain = sp_name[2] - if not os.path.isdir('circos/karyotypes/'): - os.makedirs('circos/karyotypes/') - with open('circos/karyotypes/karyotype_{}.txt'.format(os.path.basename(some_gbk)[:-13]), 'w+') as karyotype: - color = [np.random.randint(0,255), np.random.randint(0,255), np.random.randint(0,255)] - for contig in contigs: - if contig[1] > 1000: - karyotype.write('chr - {0}{1} {2} {3} {4} {5},{6},{7}\n'.format( - sp_strain, - contig[0], - contig[0], - '1', - contig[1], - *color - ) - ) - else: - break - -def get_links(group=None, perc_identity='99'): - hits_collection = kv.get_collection('hits') - group_hits = None - if not os.path.isdir('circos/links/'): - os.makedirs('circos/links/') - out_name = 'circos/links/all_links_{}.txt'.format(perc_identity) - if group: - groups = core_hgt_groups() - group_hits = sorted(groups, key=len, reverse=True)[group - 1] - out_name = 'circos/links/group{}_links_{}.txt'.format(group, 
perc_identity) - - with open(out_name, 'w+') as out_handle: - for species in hits_collection.find(): - print species - try: - all_hits = species['core_hits_{}'.format(perc_identity)] - hits_to_write = None - if group: - hits_to_write = {gene:all_hits[gene] for gene in all_hits if (species['species'], gene) in group_hits} - else: - hits_to_write = all_hits - for gene in hits_to_write: - if hits_to_write[gene]: - s1_record = kv.get_mongo_record(species['species'], gene) - s1_strain = kv.parse_species_name(species['species']) - for hit in hits_to_write[gene]: - s2_record = kv.get_mongo_record(hit[0], hit[1]) - s2_strain = kv.parse_species_name(hit[0]) - out_handle.write('{0}kvc_{1} {2} {3} {4}kvc_{5} {6} {7}\n'.format( - s1_strain[2], - s1_record['location']['contig'], - s1_record['location']['start'], - s1_record['location']['end'], - s2_strain[2], - s2_record['location']['contig'], - s2_record['location']['start'], - s2_record['location']['end'], - ) - ) - except KeyError: - pass - - -def get_gc(some_gbk): - """ - Get GC content for contigs in genbank file, output file for circs line plot - - """ - with open (some_gbk, 'r') as file_handle: - gc_points = [] - sp_name = None - for record in SeqIO.parse(file_handle, 'gb'): - for b in range(len(record))[500::1000]: - gc_cont = SeqUtils.GC(record.seq[b-500:b+499]) - gc_points.append((record.name, b-500, b+499, gc_cont)) - - sp_name = kv.parse_genbank_name(some_gbk) - sp_strain = sp_name[2] - with open('circos/GC/gc_{}.txt'.format(os.path.basename(some_gbk)[:-13]), 'w+') as out_handle: - values = [x[3] for x in gc_points] - stats = (min(values), max(values), np.average(values), np.std(values)) - out_handle.write("# Min: {}\n# Max: {}\n# Avg, Std: {}, {}\n".format( - stats[0], stats[1], stats[2], stats[3] - ) - ) - for point in gc_points: - out_handle.write('{0}{1} {2} {3} {4}\n'.format( - sp_strain, - point[0], - point[1], - point[2], - point[3] - ) - ) - return stats - -def gc_circos(gb_folder): - if not os.path.isdir('circos/GC/'): - os.makedirs('circos/GC/') - with open('circos/GC/plots.conf', 'w+') as out_handle: - out_handle.write('\n') - for f in os.listdir(gb_folder): - if f.endswith('.gb'): - print f - stats = get_gc('{}{}'.format(gb_folder, f)) - out_handle.write(""" - show = yes - type = line - - file = GC/gc_{}.txt - r1 = 0.95r - r0 = 1.05r - max = {} - min = {} - - color = grey - thickness = 1 - - - - condition = var(value) > {} - color = dgreen - - - condition = var(value) < {} - color = dred - - - \n - """.format(f[:-13], stats[1], stats[0], stats[2]+stats[3], stats[2]-stats[3]) - ) - out_handle.write('\n') - -if __name__ == '__main__': - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/') - kv.mongo_init('pacbio2') - - # if not os.path.isdir('circos'): - # os.makedirs('circos') - # for f in os.listdir('validated_gbk/'): - # if f.endswith('.gb'): - # print f - # get_karyotype('validated_gbk/{}'.format(f)) - - get_links(perc_identity='95') - get_links(perc_identity='90') - - # gc_circos('validated_gbk/') - - # for entry in kv.get_collection('core').distinct('species'): - # print 'karyotypes/karyotype_{}.txt;'.format(entry) - - \ No newline at end of file diff --git a/code/KvDataStructures.py b/code/KvDataStructures.py deleted file mode 100644 index 9118a29..0000000 --- a/code/KvDataStructures.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) 
- -'''Functions for interacting with information from MongoDB''' - -from pymongo import MongoClient, ASCENDING, DESCENDING -from bson.objectid import ObjectId -import re - - -class mongo_iter(object): - """**DEPRECATED** Iterator that steps through species in mongoDB. Call with: - `for current_species_collection in mongo_iter(mongo_db_name):`""" - def __init__(self): - self.index = -1 - self.collections = get_species_collections() - - def __iter__(self): - return self - - def next(self): - if self.index == len(self.collections) - 1: - raise StopIteration - else: - self.index += 1 - return get_collection(self.collections[self.index]) - -client = MongoClient() -db = None - -def parse_genbank_name(some_genbank): - parsed = re.search(r"([A-Za-z]+)_([A-Za-z]+)_([\w-]+)_validated\.gb", some_genbank) - return (parsed.group(1), parsed.group(2), parsed.group(3)) - -def parse_species_name(sp_name): - parsed = re.search(r"^([A-Za-z]+)_([A-Za-z]+)_([\w-]+)$", sp_name) - return (parsed.group(1), parsed.group(2), parsed.group(3)) - -def get_gene_location(location): - ''' - Takes BioPython location object eg. `[1030:1460](-)` and extracts integer values. - Returns (start, stop, direction) - ''' - location_parse = re.search(r'\[?(\d+)\]\((\+|-)\)', str(location)) - if location_parse.group(3) == '+': - direction = 1 - elif location_parse.group(3) == '-': - direction = -1 - return (int(location_parse.group(1)), int(location_parse.group(2)), direction) - -def mongo_init(mongo_db_name): - global db - db = client[mongo_db_name] - return db.name - -def get_collections(): - return db.collection_names(False) - -def get_collection(collection): - return db[collection] - -def remove_collection(collection): - print get_collections() - db.drop_collection(collection) - -def reset_database(database): - mongo_init(database) - print "Now you see it:" - print db.collection_names(False) - for collection in db.collection_names(False): - db.drop_collection(collection) - print "Now you don't!" 
- print db.collection_names(False) - -def get_species_collections(): - collections = db.collection_names(False) - if '16S' in collections: - collections.remove('16S') - if 'hits' in collections: - collections.remove('hits') - return collections - -def get_mongo_record(species, mongo_id, db='core'): - return get_collection(db).find_one({'species':species, '_id':ObjectId(mongo_id)}) - -def get_genus(species_name): - return re.search(r'^(\w+)', species_name).group(1) - -def view_record(): - for item in get_species_collections(): - print item - counter = 1 - for collection in get_collection(item).find(): - counter += 1 - if counter > 3: - break - -def fasta_id_parse(fasta_id): - """"species_name|_id" -> (species_name, _id)""" - parsed = re.search(r'(\w+)\|(\w+)', fasta_id) - return (parsed.group(1), parsed.group(2)) - -def index_contigs(species, collection='core'): - curr_collection = get_collection(collection) - return curr_collectioncollection.find({'species':species}).sort([("location.contig", ASCENDING), ("location.start", ASCENDING)]) - -def concat_contigs(species, collection='core'): - current_contig = 0 - last_contig_end = 0 - last_gene_end = 0 - concatenated = {} - for gene in index_contigs(species, collection): - if gene['species'] == species: - if gene['location']['contig'] == current_contig: - gene['location']['start'] += last_contig_end - gene['location']['end'] += last_contig_end - last_gene_end = gene['location']['end'] - - concatenated[gene['_id']] = gene - else: - current_contig = gene['location']['contig'] - last_contig_end = last_gene_end - - gene['location']['start'] += last_contig_end - gene['location']['end'] += last_contig_end - last_gene_end = gene['location']['end'] - - concatenated[gene['_id']] = gene - - return concatenated - -def make_gene_fasta(gene_record, to_file=True): - entry = ">{}|{}\n{}\n".format(gene_record['species'].replace(' ', '_'), gene_record['_id'], gene_record['dna_seq']) - - if to_file: - fasta = '{}_{}.fna'.format(gene_record['species'], gene_record['kvtag']) - with open(fasta, 'w+') as output_handle: - output_handle.write(entry) - return fasta - else: - return entry - -def make_seq_fasta(seq, name='temp'): - fasta = '{}.fasta'.format(name) - with open(fasta, 'w+') as output_handle: - output_handle.write( - ">{}\n{}\n".format(name, seq) - ) - return fasta - -def make_id_list_fasta(id_list, db): - """ - Make fasta from list of `(species, _id)` tuples - """ - genes = [] - for gene_id in id_list: - genes.append(make_gene_fasta(get_mongo_record(*gene_id, db=db), to_file=False)) - with open('tmp.fna', 'w+') as output_handle: - for gene in genes: - output_handle.write(gene) - - - -if __name__ == '__main__': - test = 'validated_genbank/Arthrobacter_arilaitensis_3M03_validated.gb' - print parse_genbank_name(test) \ No newline at end of file diff --git a/code/KvasirBlast.py b/code/KvasirBlast.py deleted file mode 100644 index 95e263d..0000000 --- a/code/KvasirBlast.py +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -''' -Must have Mongod running, in terminal: `mongod --dbpath path/to/db` -''' - -import os, re -from subprocess import Popen, PIPE -from Bio.Blast import NCBIXML -from Bio.Blast.Applications import NcbiblastnCommandline -from bson.objectid import ObjectId -from pymongo.cursor import Cursor -from KvasirHGT import core_hgt_groups -import KvDataStructures as kv - -def 
make_blast_db(source, name=None, remove_source=True): - """ - Produces BLAST database from `source` - Optional - provide name (defaults to `source`) - Set remove_source=False to keep fasta file (if created) - - Source types: - - fasta file (use path, must end with `.fna`) - - Mongo collection (use name of collection) - - list of dicts containing at least keys `species`, `_id`, `dna_seq` - - Mongo cursor eg. `collection.find({'key':value})` - """ - # If there's no directory for blast db's, create one - if not os.path.isdir('blast_databases/'): - os.makedirs('blast_databases/') - - output_fasta = None - - if os.path.isfile(source): - # Input is fasta file? - if source.endswith('.fna'): - output_fasta = source - if not name: - name = os.path.basename(source)[:-4] - remove_source = False - else: - print "Not a valid file type, use .fna" - - else: - output_fasta = '{0}_all.fasta'.format(kv.db.name) - genes = None - with open(output_fasta, 'w+') as output_handle: - if source in kv.get_collections(): - genes = kv.get_collection(source).find() - if not name: - name = source - elif type(source) == list: - genes = source - elif type(source) == Cursor: - genes = source - - for gene in genes: - output_handle.write('>{0}|{1}\n{2}\n'.format( - gene['species'], - gene['_id'], - gene['dna_seq'], - ) - ) - - while not name: - name = str(raw_input("enter name for BLAST database: ")) - - # calls makeblastdb from shell - print "making a database!" - Popen( - ['makeblastdb', - '-in', output_fasta, - '-dbtype', 'nucl', - '-out', 'blast_databases/{0}'.format(name), - '-title', name, - ] - ).wait() # waits for this operation to terminate before moving on - - if remove_source: - os.remove(output_fasta) - -def blast_vs_fasta(query, subject): - """ - Blast `query` against `subject`. Both must be paths to fasta file - Returns list of lists, each `[sseqid, qseqid, pident, length]` - """ - out = Popen( - ['blastn', - '-query', query, - '-subject', subject, - '-outfmt', '10 sseqid qseqid pident length', - ], stdout=PIPE - ).communicate()[0] - if out: - result = re.split(r'[,\n]', out.rstrip())[0] - return [result[i:i+4] for i in range(len(result))[0::4]] - -def blast_vs_db(query, db): - """ - Blast `subject` (fasta file) against `db` (blast db). 
- Returns list of lists, each `[qseqid, sseqid, pident, length]` - """ - out = Popen( - ['blastn', - '-query', query, - '-db', db, - '-outfmt', '10 qseqid sseqid pident length', - ], stdout=PIPE - ).communicate()[0] - if out: - result = re.split(r'[,\n]', out.rstrip()) - return [result[i:i+4] for i in range(len(result))[0::4]] - -def core_hgt_blast(perc_identity='99'): - """ - Blasts all core genomes against core db - - Set `perc_identity` if desired (default = 99) - """ - if not os.path.isdir('blast_results/core/'): - os.makedirs('blast_results/core/') - for species in kv.get_collection('core').distinct('species'): - query_fasta = 'blast_results/core/{}_tmp.fna'.format(species) - - with open(query_fasta, 'w+') as query_handle: - for query in kv.get_collection('core').find({'species':species}): - if query['type'] == 'gene': - query_handle.write('>{0}|{1}\n{2}\n'.format( - query['species'], - query['_id'], - query['dna_seq'] - ) - ) - - print 'Blasting {0}'.format(species) - out = Popen( - ['blastn', - '-query', query_fasta, - '-db', 'blast_databases/core', - '-outfmt', '5', - '-out', 'blast_results/core/{}_{}_blast.xml'.format(species, perc_identity), - '-perc_identity', perc_identity - ], - stdout=PIPE - ).communicate()[0] - - os.remove(query_fasta) - -def blast_to_db(db='core', perc_identity='99'): - blast_dir = 'blast_results/{}/'.format(db) - for f in os.listdir(blast_dir): - if f.endswith('{}_blast.xml'.format(perc_identity)): - file_handle = 'blast_results/{}/{}'.format(db,f) - with open(file_handle, 'r') as result_handle: - blast_records = NCBIXML.parse(result_handle) - hits_dict = {} - for blast_record in blast_records: - query_parse = re.search(r'(\w+)\|(\w+)', blast_record.query) - query_genus_parse = re.match(r'([A-Za-z]+)_', blast_record.query) - query_genus = query_genus_parse.group(1) - query_name = query_parse.group(1) - query_id = query_parse.group(2) - - hits_dict[query_id] = [] - - for alignment in blast_record.alignments: - hit_parse = re.search(r'(\w+)\|(\w+)', alignment.hit_def) - hit_genus_parse = re.match(r'([A-Za-z]+)_', alignment.hit_def) - hit_genus = hit_genus_parse.group(1) - - hit_name = hit_parse.group(1) - hit_id = hit_parse.group(2) - if query_name == hit_name: - pass - elif query_genus == hit_genus: - print "Oops! 
{} and {} are the same genus, skipping...".format(query_name, hit_name) - pass - elif kv.get_mongo_record(hit_name, hit_id)['type'] == '16S': - print 'Skipping 16S hit' - else: - print '=======\nhit for {0} detected:\nspecies: {1}\n======='.format(query_name, hit_name) - hits_dict[query_id].append((hit_name, hit_id)) - - print 'Updataing mongoDB with hits' - hits_collection = kv.get_collection('hits') - hits_collection.update_one( - {'species':query_name}, - {'$set':{'{}_hits_{}'.format(db, perc_identity):{x:hits_dict[x] for x in hits_dict if hits_dict[x]}}}, - upsert=True - ) - -def hits_reset(): - kv.remove_collection('hits') - -def other_blast(): - groups_list = core_hgt_groups() - groups_list.sort(key=len, reverse=True) - - for i in range(len(groups_list)): - group_hits = [] - kv.make_id_list_fasta(groups_list[i], 'core') - results = blast_vs_db('tmp.fna', 'blast_databases/other') - - hits_collection = kv.get_collection('hits') - if results: - for j in range(len(results)): - group_hits.append(results[j]) - hits_collection.insert_one({'group':(i+1), 'group_hits':group_hits}) - - -if __name__ == '__main__': - import sys - kv.mongo_init('pacbio2') - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/') - # kv.mongo_init(sys.argv[1]) - # os.chdir('output/{}/'.format(sys.argv[1])) - make_blast_db('core') - make_blast_db('other') - hits_reset() - hgt_blast(perc_identity='90') - hgt_blast(perc_identity='95') - hgt_blast(perc_identity='99') - blast_to_db(perc_identity='90') - blast_to_db(perc_identity='95') - blast_to_db(perc_identity='99') - - other_blast() diff --git a/code/KvasirHGT.py b/code/KvasirHGT.py deleted file mode 100644 index 5caa730..0000000 --- a/code/KvasirHGT.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import KvDataStructures as kv -import pandas as pd - -def core_hgt_stats(perc_identity='99'): - """ - Returns stats of HGT (number of events etc) - """ - collection = kv.get_collection('core') - df_index = ['Total_CDS', 'HGT_CDS', 'Islands'] - df = pd.DataFrame() - for species in collection.distinct('species'): - hits = kv.get_collection('hits').find_one({'species':species})['core_hits_{}'.format(perc_identity)] - series = pd.Series([ - sum([1 for x in collection.find({'species':species})]), - sum([1 for x in hits if hits[x]]), - len(get_islands(species)) - ], name=species, index=df_index) - df = df.append(series) - - df.to_csv('stats.csv', columns=df_index) - -def collapse_lists(list_of_lists): - """ - Reduces list of lists by combining lists with identical elements - **Example input**: [[1,2,3],[3,4],[5,6,7],[1,8,9,10],[11],[11,12],[13],[5,12]] - **Example output: [[1,2,3,4,8,9,10],[5,6,7,11,12],[13]] - - from stackoverflow user YXD: http://stackoverflow.com/questions/30917226/collapse-list-of-lists-to-eliminate-redundancy - """ - result = [] - for l in list_of_lists: - s = set(l) - - matched = [s] - unmatched = [] - # first divide into matching and non-matching groups - for g in result: - if s & g: - matched.append(g) - else: - unmatched.append(g) - # then merge matching groups - result = unmatched + [set().union(*matched)] - return result - -def get_islands(species_name, perc_identity='99'): - """ - For each species, combines HGT hits co-occurring within 5kb of eachother - Returns list of lists of `(species, _id)` tuples - """ - - islands = [] - species_hits_list = [] - - # 
Add mongo_record for each hit in any gene - all_hits = kv.get_collection('hits') - species_hits = all_hits.find_one({'species':species_name})['core_hits_{}'.format(perc_identity)] - - - for query_id in species_hits: - if species_hits[query_id]: - species_hits_list.append( - kv.get_mongo_record(species_name, query_id) - ) - - for entry_1 in species_hits_list: - entry_recorded = False - for entry_2 in species_hits_list: - if entry_1 == entry_2: - pass - elif entry_1['location']['contig'] != entry_2['location']['contig']: - pass - else: - location_1 = entry_1['location'] - location_2 = entry_2['location'] - if abs(location_1['end'] - location_2['start']) <= 5000: - entry_recorded = True - islands.append([ - (entry_1['species'], str(entry_1['_id'])), - (entry_2['species'], str(entry_2[ '_id'])) - ]) - if not entry_recorded: - islands.append([(entry_1['species'], str(entry_1['_id']))]) - - return collapse_lists(islands) - -def core_hgt_groups(perc_identity='99'): - """ - Returns mutilspecies groups of genes as list of lists. - - - Starts with species islands `get_islands()` - - For each island, if a hit from that island is in another island, - group them together - """ - all_hits = kv.get_collection('hits') - groups_list = [] - for s in all_hits.distinct('species'): - s_hits = all_hits.find_one({'species':s}) - current_species_islands = get_islands(s_hits['species']) - - # each sublist represents one island... - for island in current_species_islands: - if island: # many lists are empty, skip those - hit_set = set() # container for hits - for gene_id in island: - gene_hits = s_hits['core_hits_{}'.format(perc_identity)][gene_id[1]] - - # Pulls each hit id tuple, then appends it to group_set - for hit in gene_hits: - hit_set.add((hit[0], hit[1])) - # add id tuples for hits to island list... - island.update(hit_set) - # And add new island (with multiple species) to groups_list - groups_list.append(list(island)) - - # Since each species' islands are built independently, there's a lot of redundancy - # So... 
Collapse lists that contain shared elements and deduplicate - return map(list, collapse_lists(groups_list)) - -def output_groups(min_group_size=2): - """ - Returns .csv file with information for each CDS in an HGT group - - - Optional: set minimum number of CDS to be considered a group - """ - output_file = '{}_groups.csv'.format(kv.db.name) - df_index = ['group','kvtag','contig','start','end','strand','annotation','dna_seq'] - df = pd.DataFrame() - group_no= 0 - groups_list = core_hgt_groups() - groups_list.sort(key=len, reverse=True) - - for group in groups_list: - if len(group) >= min_group_size: - group.sort(key=lambda entry:entry[0]) - group_no += 1 - for entry in group: # Entry is `(species, id)` - - db_handle = kv.get_mongo_record(*entry) - annotation = db_handle['annotation'].replace(',','') # prevents CSV screw-up - series = pd.Series( - [str(group_no).zfill(3), - db_handle['kvtag'], - db_handle['location']['contig'], - db_handle['location']['start'], - db_handle['location']['end'], - db_handle['location']['strand'], - annotation, - db_handle['dna_seq'] - ], - index=df_index, - name=db_handle['species'] - ) - df=df.append(series) - df.to_csv(output_file, columns=df_index) - -def group_hits(core=False): - all_species = kv.get_collection('core').distinct('species') - if not core: - all_species.extend(kv.get_collection('other').distinct('species')) - - - hits_db = kv.get_collection('hits') - species_index = sorted(all_species) - print species_index - df = pd.DataFrame() - core_groups = sorted(core_hgt_groups(), key=len, reverse=True) - - - for group in sorted(hits_db.distinct('group')): - recorded = [] - s = {sp:0.0 for sp in species_index} - for hit in core_groups[group-1]: - if not hit in recorded: - s[hit[0]] += len(kv.get_mongo_record(*hit)['dna_seq']) - recorded.append(hit) - - for hit in hits_db.find_one({'group':group})['group_hits']: - if float(hit[2]) > 90 and float(hit[3]) > 100: - if hit[1] not in recorded: - s[kv.fasta_id_parse(hit[1])[0]] += float(hit[2])*float(hit[3])/100 - recorded.append(hit[1]) - - s = pd.Series(s, name='group_{}'.format(group)) - df['group_{}'.format(group)] = s - - df.to_csv('group_hits_other.csv') - -# if __name__ == '__main__': -# import os -# kv.mongo_init('pacbio2') -# os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/') -# # group_hits(core=True) -# # output_groups() -# # core_hgt_stats() -# output_hits_csv() \ No newline at end of file diff --git a/code/KvasirPhylo.py b/code/KvasirPhylo.py deleted file mode 100644 index d4b6ef0..0000000 --- a/code/KvasirPhylo.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os - -import KvDataStructures as kv -import pandas as pd - -from itertools import combinations_with_replacement -from subprocess import Popen, PIPE -from skbio import DNA -from skbio import DistanceMatrix -from skbio import tree -from skbio.alignment import StripedSmithWaterman - -def get_gene_distance(seq_1, seq_2): - """ - Returns hamming distance between two DNA sequences - Alignment based on Striped Smith-Waterman algorithm - """ - query = StripedSmithWaterman(seq_1.upper()) - alignment = query(seq_2.upper()) - q = DNA(alignment.aligned_query_sequence) - t = DNA(alignment.aligned_target_sequence) - return q.distance(t) - -def get_perc_id(seq_1, seq_2): - return 1.0 - get_gene_distance(seq_1, seq_2) - -def get_16S_distance(species_1, 
species_2): - """ - Returns hamming distance of the 16S seq of two species in MongoDB - """ - if species_1 == species_2: - return 0.0 - else: - s1_ssu = kv.db['16S'].find_one({'species':species_1}) - s2_ssu = kv.db['16S'].find_one({'species':species_2}) - if s1_ssu and s2_ssu: - if len(s1_ssu['dna_seq']) > 800 and len(s1_ssu['dna_seq']) > 800: - return get_gene_distance(str(s1_ssu['dna_seq']), str(s2_ssu['dna_seq'])) - else: - return None - else: - return None - -def get_distance_matrix(core=False, to_file=True): - all_species = kv.get_collection('core').distinct('species') - if core: - pass - else: - all_species.extend(kv.get_collection('other').distinct('species')) - - ssu_species = [n for n in all_species if kv.db['16S'].find_one({'species':n})] - distance_matrix = pd.DataFrame(data={n:0.0 for n in ssu_species}, index=ssu_species, columns=ssu_species) - - for pair in combinations_with_replacement(ssu_species, 2): - distance = get_16S_distance(pair[0], pair[1]) - if distance: - distance_matrix[pair[0]][pair[1]] = distance - distance_matrix[pair[1]][pair[0]] = distance - - if to_file: - distance_matrix.to_csv('distance_matrix.csv') - else: - return distance_matrix - -def get_tree(core=False, newick=False): - core_collection = kv.get_collection('core') - all_species = core_collection.distinct('species') - if core: - pass - else: - other_collection = kv.get_collection('other') - all_species.extend(other_collection.distinct('species')) - ssu_species = [n for n in all_species if kv.db['16S'].find_one({'species':n})] - - dm = DistanceMatrix(get_distance_matrix(core=core, to_file=False), ssu_species) - t = tree.nj(dm) - print t.ascii_art() - tips = [] - for node in t.tips(): - print node.name, node.length - tips.append(node.name.replace(' ', '_')) - if newick: - n = tree.nj(dm, result_constructor=str) - print n - else: - return (t, tips) - -def ssu_fasta(): - with open('16s.fna', 'w+') as out_handle: - for species in kv.get_collection('16S').distinct('species'): - ssu = kv.get_collection('16S').find_one({'species':species}) - if ssu: - out_handle.write(kv.make_gene_fasta(ssu, to_file=False)) - else: - print species - -def add_ssu(supp_file): - # df = pd.read_csv(supp_file) - # print df.columns - # new_df = pd.DataFrame() - # # for i in range(len(df['Strain'])): - # # print df['Strain'][i].replace(' ', '_').replace('.', '') - - # strain = pd.Series([df['Strain'][i].replace(' ', '_').replace('.', '') for i in range(len(df['Strain']))], name='strain') - # ssus = df['sequences of the 16s rRNA genes'] - - # ssu = pd.Series([ssus[i].replace(r'\n', '') if not pd.isnull(ssus[i]) else None for i in range(len(ssus))], name='16S') - # new_df['strain'] = strain - # new_df['16S'] = ssu - - # new_df.to_csv('ssu.csv') - - ssu_df = pd.read_csv(supp_file) - for i in range(len(ssu_df['strain'])): - print ssu_df['strain'][i], ssu_df['16S'][i] - if not pd.isnull(ssu_df['16S'][i]): - gene_record = { - 'species':ssu_df['strain'][i], - 'location':{ - 'contig':None, - 'start':None, - 'end':None, - 'strand':None, - }, - 'annotation':'Small subunit ribosomal RNA', - 'dna_seq':ssu_df['16S'][i], - 'kvtag':None, - 'type':'16S' - } - - print "adding 16S gene!" 
- kv.get_collection('16S').remove({'species':ssu_df['strain'][i]}) - print kv.get_collection('16S').find_one({'species':ssu_df['strain'][i]}) - kv.get_collection('16S').insert_one(gene_record) - print kv.get_collection('16S').find_one({'species':ssu_df['strain'][i]}) - -def get_list_from_tree(newick_file): - pass - - -if __name__ == '__main__': - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/reorg/') - kv.mongo_init('reorg') diff --git a/code/reset_db.py b/code/reset_db.py deleted file mode 100644 index 1177214..0000000 --- a/code/reset_db.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import sys -import KvDataStructures as kv - -kv.mongo_init(sys.argv[1]) -kv.reset_database(kv.db.name) \ No newline at end of file diff --git a/code/run_kvasir.py b/code/run_kvasir.py deleted file mode 100644 index 9ad72a8..0000000 --- a/code/run_kvasir.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -''' -- Create directories `path/input/` and `path/output/` -- Put genbank files into `path/input/core` or `path/input/other` -- Must have Mongod running, in terminal: `mongod --dbpath path/to/db` -- run `python run_kvasir.py [run_name]` from `path/` -''' - -import os -import sys -import datetime -import DataImport, FixGbk, KvasirBlast -from KvasirHGT import output_groups -from KvDataStructures import mongo_init, reset_database - -print 'Here we go!' - -core_gbk_folder = os.path.abspath('data/input/core/') -other_gbk_folder =os.path.abspath('data/input/other/') -exp_name = datetime.datetime.now().strftime('%Y%m%d%H%M') - -mongo_init(exp_name) -reset_database(exp_name) - -new_folder = 'data/output/{0}/'.format(exp_name) -if not os.path.isdir(new_folder): - os.makedirs(new_folder) -os.chdir(new_folder) - -print 'Checking and importing core genomes...' - -for the_file in os.listdir(core_gbk_folder): - path_to_file = '{0}/{1}'.format(core_gbk_folder, the_file) - DataImport.import_file(path_to_file, 'core') - -print 'Checking and importing other genomes...' 
-
-for the_file in os.listdir(other_gbk_folder):
-    path_to_file = '{0}/{1}'.format(other_gbk_folder, the_file)
-    DataImport.import_file(path_to_file, 'other')
-
-
-KvasirBlast.make_blast_db('core')
-KvasirBlast.make_blast_db('other')
-KvasirBlast.core_hgt_blast(perc_identity='90')
-# KvasirBlast.core_hgt_blast(perc_identity='95')
-# KvasirBlast.core_hgt_blast(perc_identity='99')
-KvasirBlast.blast_to_db(perc_identity='90')
-# KvasirBlast.blast_to_db(perc_identity='95')
-# KvasirBlast.blast_to_db(perc_identity='99')
-output_groups()
-
diff --git a/requirements.txt b/requirements.txt
index 365f886..2cf807d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 pymongo
-Biopython
\ No newline at end of file
+biopython
diff --git a/run.py b/run.py
index 58ba82f..3c36619 100644
--- a/run.py
+++ b/run.py
@@ -1,4 +1,6 @@
 from settings import *
+import sys
+sys.path.append('src/')
 from DataImport.mongo_import import mongo_import_genbank
 from FindHGT.make_blast_db import make_blast_db, db_cds_to_fna
 from FindHGT.run_blast import blast_all, parse_blast_results_xml
@@ -18,36 +20,35 @@ def blast_db():
     if not os.path.isdir(db_path):
         os.makedirs(db_path)
 
-    make_blast_db(fasta.name, "nucl", os.path.join(db_path, "genes")) # Collection name option? (see ln9 above)
+    make_blast_db(fasta.name, "nucl", os.path.join(db_path, "genes")) # Collection name option? (see ln10 above)
 
 def blast():
-    fasta = db_cds_to_fna('genes') # Collection name option? (see ln9 above)
+    fasta = db_cds_to_fna('genes') # Collection name option? (see ln10 above)
     db_path = os.path.join(OUTPUT, MONGODB.name, "blast_db", "genes")
 
     blast_results = blast_all(fasta, db_path)
     parse_blast_results_xml(blast_results)
 
-def analyze():
-    groups = output.hgt_groups(0.99, minimum_length=500, dist_between_hits=5000)
-    output.output_groups(groups, os.path.join(OUTPUT, "99-500-5000-groups-test.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=400, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-400-5000-groups.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=400, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-00-5000-groups.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=300, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-300-5000-groups.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=400, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-400-5000-groups.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=400, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-400-5000-groups.csv"))
-
+def analyze(minimum_identity, minimum_length=500, dist_between_hits=5000):
+    groups = output.hgt_groups(minimum_identity, minimum_length, dist_between_hits)
+    output.output_groups(
+        groups, os.path.join(
+            OUTPUT, "{}-{}-{}-groups.csv".format(
+                int(round(minimum_identity * 100)), minimum_length, dist_between_hits
+            )
+        )
+    )
+"""
+WIP
+"""
 def run_circos():
+    pass
     # circos.get_karyotypes()
-    circos.get_links(0.99, 500, group_no=2, min_dist=3000, annotation="transposase")
-    circos.get_links(0.99, 500, group_no=1, min_dist=3000, annotation="transposase")
+    # circos.get_links(0.99, 500, group_no=2, min_dist=3000, annotation="transposase")
+    # circos.get_links(0.99, 500, group_no=1, min_dist=3000, annotation="transposase")
     # circos.get_links(0.90, 400, 0.95)
     # circos.get_gc_conf()
     # circos.get_conf_file("path/to/output/database_name/circos/links/99-400-links.txt",
diff --git a/settings.py b/settings.py
index 0ab289c..772f0a7 100644
--- a/settings.py
+++ b/settings.py
@@ -1,5 +1,5 @@
 import pymongo
 
 INPUT = "/path/to/input/" # path to folder with genbank files
-OUTPUT = "/Users/KBLaptop/computation/kv_data/output/database_name/" # path to output folder
+OUTPUT = "/path/to/output/" # path to output folder
 MONGODB = pymongo.MongoClient()["database_name"]
diff --git a/src/Analysis/output.py b/src/Analysis/output.py
index 6fe3d7b..a53e234 100644
--- a/src/Analysis/output.py
+++ b/src/Analysis/output.py
@@ -107,11 +107,11 @@ def find_all_hits(some_id, minimum_identity, minimum_length=100):
     as_query = {"type": "blast_result", "query": str(some_id)} # maybe should use ObjectId on import, see issue#10
     as_subject = {"type": "blast_result", "subject": str(some_id)}
 
-    blast_hits = collection.find({"$or": [as_query, as_subject],
-                                  "perc_identity": {"$gte": minimum_identity},
-                                  "length": {"$gte": minimum_length}
-                                  }
-                                  ) # will evaluate None if no pair is found
+    blast_hits = collection.find({"$or": [as_query, as_subject],
+                                  "perc_identity": {"$gte": minimum_identity},
+                                  "length": {"$gte": minimum_length}
+                                  }
+                                 ) # NB: returns a (possibly empty) Cursor, never None
 
     # we only need to get the hit back
     if blast_hits:
diff --git a/test.txt b/test.txt
deleted file mode 100644
index e69de29..0000000
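For reference, the reworked `analyze()` exposes the grouping thresholds as arguments instead of hard-coding them. A short interactive sketch of the new signature (assumes a running `mongod` and a database already populated via `import_data()` and `blast_db()`; the threshold values below are illustrative, not recommendations):

```
>>> import run
>>> run.analyze(0.95, minimum_length=300, dist_between_hits=3000)
```

This writes `95-300-3000-groups.csv` into the `OUTPUT` folder configured in `settings.py`.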
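The query in `find_all_hits()` retrieves a gene's BLAST hits whether the gene was recorded as the query or the subject, filtering on identity and alignment length in the same call. A minimal standalone sketch of that pattern (the database name, collection name, and gene id here are hypothetical; `find()` yields a Cursor, which is simply empty when nothing matches):

```
from pymongo import MongoClient

# hypothetical database/collection names and gene id
collection = MongoClient()["database_name"]["collection"]
some_id = "example_gene_id"  # ids are stored as strings on import (see issue#10)

blast_hits = collection.find({
    "$or": [{"type": "blast_result", "query": some_id},
            {"type": "blast_result", "subject": some_id}],
    "perc_identity": {"$gte": 0.99},  # identity as a fraction, as in run.analyze(0.99)
    "length": {"$gte": 500}           # minimum alignment length
})

for hit in blast_hits:  # iterating an empty Cursor is a no-op
    print hit["query"], hit["subject"], hit["perc_identity"]
```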