From 47c75d8d6737d857f8cf27756e5c11f0c45236c3 Mon Sep 17 00:00:00 2001
From: Kevin Bonham
Date: Fri, 27 May 2016 12:35:31 -0400
Subject: [PATCH] Clean-up (#14)

* Clean-up

* fixed tab in output.py

* removed old code/ folder

* more cleanup
---
 .gitignore                    |    4 -
 README.md                     |   43 ++--
 code/DataImport.py            |  130 ----------
 code/DeprecatedOutputs.py     |  268 --------------------
 code/DeprecatedPhylo.py       |  453 ----------------------------------
 code/Deprecatedget_outputs.py |   23 --
 code/FixGbk.py                |   78 ------
 code/KvCircos.py              |  177 -------------
 code/KvDataStructures.py      |  168 -------------
 code/KvasirBlast.py           |  229 -----------------
 code/KvasirHGT.py             |  201 ---------------
 code/KvasirPhylo.py           |  148 -----------
 code/reset_db.py              |   10 -
 code/run_kvasir.py            |   56 -----
 requirements.txt              |    2 +-
 run.py                        |   37 +--
 settings.py                   |    2 +-
 src/Analysis/output.py        |   10 +-
 test.txt                      |    0
 19 files changed, 44 insertions(+), 1995 deletions(-)
 delete mode 100644 code/DataImport.py
 delete mode 100644 code/DeprecatedOutputs.py
 delete mode 100644 code/DeprecatedPhylo.py
 delete mode 100644 code/Deprecatedget_outputs.py
 delete mode 100644 code/FixGbk.py
 delete mode 100644 code/KvCircos.py
 delete mode 100644 code/KvDataStructures.py
 delete mode 100644 code/KvasirBlast.py
 delete mode 100644 code/KvasirHGT.py
 delete mode 100644 code/KvasirPhylo.py
 delete mode 100644 code/reset_db.py
 delete mode 100644 code/run_kvasir.py
 delete mode 100644 test.txt

diff --git a/.gitignore b/.gitignore
index faeeb03..5f82b27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,3 @@
 .DS_store
 *ipynb*
-*.sublime-project
-*.sublime-workspace
 *.pyc
-.idea
-user_settings.py
\ No newline at end of file

diff --git a/README.md b/README.md
index df00e81..14dc4e2 100644
--- a/README.md
+++ b/README.md
@@ -7,36 +7,29 @@ Dependencies:
 * MongoDB
 * pymongo
 * BioPython
-* ~~Scikit Bio~~
-* ~~Pandas~~
+* BLAST+ CLI
 
 **Identification of horizontal gene transfer between sequenced microbial genomes**
 
-##The following is out of date... Will get back to this soon
+Kvasir takes as input a folder of annotated genomes in GenBank format. The protein-coding genes from these genomes are loaded into a database and BLASTed against each other.
 
-##~~Running Kvasir~~
-~~With dependencies installed, fire up a Mongod instance. In the terminal:~~
+### Usage
 
-~~`mongod --dbpath path/to/db`~~
+Change the values in `settings.py` to set the path to your input folder, the path for output, and the name you want for your database.
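+
+For illustration only, `settings.py` might look something like the sketch below; the
+variable names here are hypothetical, so check the file itself for the real ones:
+
+```
+# hypothetical settings.py values (names are illustrative, not the module's real API)
+INPUT_FOLDER = "path/to/genbank_files"  # folder of .gb genomes to import
+OUTPUT_FOLDER = "path/to/output"        # where results get written
+DB_NAME = "kvasir"                      # name for the MongoDB database
+```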
 
-~~Run Kvasir by invoking run_kvasir.py in your working directory:~~
+Launch a local `mongod` instance:
+```
+$ mongod --dbpath path/to/db
+```
 
-~~`python run_kvasir.py /path/to/gb_files name_of_mongoDB`~~
+Run the functions in `run.py`. Eventually this will be streamlined; for now, call them from an interactive session:
 
-####~~DataImport~~:
-* ~~Imports genbank-formated annotated genomes into Mongo database.~~
-* ~~.gb files require "locus_tag" feature. If your genomes don't have it, FixGbk.py shoul take care of it for you~~
-* ~~Mongo database has "collections" and "documents" - a different collection is generated for each species (each separate genbank file) and documents representing each CDS.~~
-  * ~~CDS documents are like python dictionaries, and contain entries for species, DNA and amino acid protein sequences, contig and location info, and annotation information.~~
-  * ~~each document is assigned a uniqe `_id` attribute within the species, so every gene is uniquely identified by a `(species, _id)` tuple~~
-
-####~~MakeBlastDb~~
-~~Generates a multi-fasta file containing every gene in the mongo database, generates a BLASTable database, then deletes the temprorary file~~
-
-####~~KsavirBlast~~
-* ~~For each species, generates a temporary fasta file and BLASTs against every other gene in the database~~
-* ~~BLAST generates and xml document, which is parsed for unique hits~~
-* ~~new "hits" entry is added to each gene document in MongoDB, which contains a list of `(species, _id)` tuples for each hit (these are used in the next script to gather info about hits)~~
-
-####~~Outputs~~
-~~Still a work in progress. So far, have a bunch of output formates working... will detail later.~~
+```
+Python 2.7.11 (default, Dec 14 2015, 10:44:13)
+[GCC 4.2.1 Compatible Apple LLVM 7.0.2 (clang-700.1.81)] on darwin
+Type "help", "copyright", "credits" or "license" for more information.
+>>> import run
+>>> run.import_data()
+>>> run.blast_db()
+>>> run.analyze(0.99)
+```
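
(In the session above, the argument to `run.analyze` appears to be a sequence-identity
cutoff; the removed code below calls BLAST with `-perc_identity` values of 90, 95, and
99, so `0.99` keeps only hits with at least 99% identity.)

For reference, each protein-coding gene is stored as a single MongoDB document. The
sketch below is read directly off the `gene_record` dict built in the deleted
`DataImport.py` that follows; the field names come from that code, while every value
is an invented placeholder:

```
# Shape of one 'gene' document, per DataImport.import_file (values are placeholders)
gene_record = {
    'species': 'Arthrobacter_arilaitensis_Re117',
    'location': {
        'contig': 'kvc_001',  # contig name
        'start': 1030,        # coordinates from the GenBank feature
        'end': 1460,
        'strand': -1,         # 1 = forward strand, -1 = reverse
        'index': None,
    },
    'locus_tag': 'AAR2_00001',
    'annotation': 'hypothetical protein',
    'dna_seq': 'ATG...TAA',   # nucleotide sequence of the CDS
    'aa_seq': 'MKT...',       # translation from the GenBank record
    'type': 'gene',           # contigs use 'contig', 16S genes use '16S'
}
```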
diff --git a/code/DataImport.py b/code/DataImport.py
deleted file mode 100644
index 6356ba9..0000000
--- a/code/DataImport.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-# by Kevin Bonham, PhD (2015)
-# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA
-# Unless otherwise indicated, licensed under GNU Public License (GPLv3)
-
-'''
-This script is designed to import data from a genbank file and
-write it to a mongoDB database. Must have Mongod running, in terminal:
-`mongod --dbpath path/to/db`.
-'''
-
-import re, os
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC
-import KvDataStructures as kv
-
-def import_file(some_genbank, collection):
-    """
-    Import records from `some_genbank` into Mongo `collection`
-    * Imports each coding sequence (CDS) as document of {'type':'gene'}
-    * Imports up to one 16S rRNA sequences as document of {'type':'16S'}
-    * Each document has info on species, contig and location, DNA sequence and (for CDS) amino acid sequence
-    * Each gene in genbank file MUST have `locus_tag` feature.
-    If it doesn't, use `add_locus_tags()`
-    * Note - `add_locus_tags()` doesn't exist yet, will be similar to `FixGbk.validate_gbk()`
-    """
-    with open(some_genbank, 'r') as open_file:
-        collection = kv.get_collection(collection)
-
-        # Each "record" in genbank file is read, corresponds to individual contigs
-        for record in SeqIO.parse(open_file, 'gb'):
-            current_contig = record.name
-            try:
-                current_species = record.annotations['source']
-            except KeyError:
-                name = re.search(r'\w+\/(.+)\.\w+$', some_genbank)
-                current_species = name.group(1)
-
-            collection.insert_one({
-                'species':current_species,
-                'contig':current_contig,
-                'dna_seq':str(record.seq),
-                'type':'contig'
-            })
-
-            print "Importing {}".format(current_contig)
-            ssu_gene = get_16S(record)
-            if ssu_gene:
-                try:
-                    locus_tag = ssu_gene[0].qualifiers['locus_tag'][0]
-                except KeyError:
-                    locus_tag = None
-
-                parsed_location = kv.get_gene_location(ssu_gene[0].location)
-                gene_record = {
-                    'species':current_species,
-                    'location':{
-                        'contig':current_contig,
-                        'start':parsed_location[0],
-                        'end':parsed_location[1],
-                        'strand':parsed_location[2],
-                    },
-                    'locus_tag':locus_tag,
-                    'annotation':ssu_gene[0].qualifiers['product'][0],
-                    'dna_seq':ssu_gene[1],
-                    'type':'16S'
-                }
-                print "adding 16S gene!"
-                collection.insert_one(gene_record)
-                kv.get_collection('16S').insert_one(gene_record)
-
-            for feature in record.features:
-                if feature.type == 'CDS':
-                    parsed_location = kv.get_gene_location(feature.location)
-                    try:
-                        locus_tag = feature.qualifiers['locus_tag'][0]
-                    except KeyError:
-                        locus_tag = None
-
-                    gene_record = {
-                        'species':current_species,
-                        'location':{
-                            'contig':current_contig,
-                            'start':parsed_location[0],
-                            'end':parsed_location[1],
-                            'strand':parsed_location[2],
-                            'index':None
-                        },
-                        'locus_tag':locus_tag,
-                        'annotation':feature.qualifiers['product'][0],
-                        'dna_seq':get_dna_seq(feature, record),
-                        'aa_seq':feature.qualifiers['translation'][0],
-                        'type':'gene'
-                    }
-                    collection.insert_one(gene_record)
-
-def get_dna_seq(feature, record):
-    if feature.location.strand == 1:
-        dna_seq = record.seq[
-            feature.location.start:feature.location.end
-        ]
-    elif feature.location.strand == -1:
-        dna_seq = record.seq[
-            feature.location.start:feature.location.end
-        ].reverse_complement()
-    return str(dna_seq)
-
-def get_16S(gbk_record):
-    for feature in gbk_record.features:
-        if feature.type == 'rRNA':
-            if re.search(r"16[sS]|ssu|SSU", str(feature.qualifiers['product'])):
-                if len(feature) > 1000:
-                    return (feature, get_dna_seq(feature, gbk_record))
-                else:
-                    return None
-
-# Need to fix search! Only returns "contig"...
-def get_contig(record_name):
-    import re
-    parse_contig = re.search(r'kvc_(\d\d\d)', record_name)
-    return parse_contig.group(1)
-
-def import_folder(genbank_folder):
-    import os
-    print 'we\'re importing a folder now!'
- for a_file in os.listdir(genbank_folder): - current_file = os.path.join(genbank_folder, a_file) - print 'importing {0}!'.format(current_file) - import_file(current_file) diff --git a/code/DeprecatedOutputs.py b/code/DeprecatedOutputs.py deleted file mode 100644 index 7966101..0000000 --- a/code/DeprecatedOutputs.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python 2.7 -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os -from itertools import combinations, combinations_with_replacement -from bson.objectid import ObjectId -import pandas as pd -import KvDataStructures as kv -from skbio import DNA -from skbio import DistanceMatrix -from skbio import tree -from skbio.alignment import StripedSmithWaterman -from matplotlib import pyplot as plt -import numpy as np - - -def output_hits_csv(): - hits = kv.get_collection('hits') - if not os.path.isdir('hits/'): - os.makedirs('hits/') - - df_index = [ - 'parent_locus', - 'parent_annotation', - 'parent_seq', - 'parent_contig', - 'parent_start', - 'parent_end', - 'parent_strand', - 'hit_tag', - 'hit_annotation', - 'hit_seq', - 'hit_contig', - 'hit_start', - 'hit_end', - 'hit_strand', - ] - - for record in hits.find(): - query_species = record['species'] - df = pd.DataFrame() - - for query_id in record['hits']: - list_of_hits = record['hits'][query_id] - if list_of_hits: - query_record = kv.get_collection('core').find_one({'species':query_species, '_id':ObjectId(query_id)}) - - for hit in list_of_hits: - - hit_species = hit[0] - hit_id = hit[1] - hit_record = kv.get_collection('core').find_one({'species':hit_species, '_id':ObjectId(hit_id)}) - hit_record['kvtag'] - query_annotation = query_record['annotation'].replace(',','') - hit_annotation = hit_record['annotation'].replace(',','') - - series = pd.Series( - [query_record['kvtag'], - query_annotation, - query_record['dna_seq'], - query_record['location']['contig'], - query_record['location']['start'], - query_record['location']['end'], - query_record['location']['strand'], - hit_record['kvtag'], - hit_annotation, - hit_record['dna_seq'], - hit_record['location']['contig'], - hit_record['location']['start'], - hit_record['location']['end'], - hit_record['location']['strand'], - ], - index=df_index, - name=hit_record['species'] - ) - - df=df.append(series) - df.to_csv('hits/{}_hits.csv'.format(query_species), columns=df_index) - -def output_one_fasta(mongo_record, out_file='output.fna'): - with open(out_file, 'w+') as output_handle: - output_handle.write( - '>{0}|{1}\n{2}\n'.format( - mongo_record['species'], - str(mongo_record['_id']), - mongo_record['dna_seq'], - ) - ) - -def get_groups(): - all_hits = kv.get_collection('hits') - groups_list = [] - for h in all_hits.find(): - current_species = h['species'] - - # if any([x in current_species for x in dutton_list] or [y in current_species for y in wolfe_list]): - current_species_islands = get_islands(h['species']) - - # each sublist represents one island... - for island in current_species_islands: - hit_set = set() # container for hits - for gene_id in island: - gene_hits = h['hits'][gene_id[1]] - - # Pulls each hit id tuple, then appends it to group_set - for hit in gene_hits: - hit_set.add((hit[0], hit[1])) - # add id tuples for hits to island list... 
- island.update(hit_set) - # And add new island (with multiple species) to groups_list - groups_list.append(list(island)) - - # Since each species' islands are built independently, there's a lot of redundancy - # So... Collapse lists that contain shared elements and deduplicate - return map(list, collapse_lists(groups_list)) - -def output_groups(output_file='default', min_group_size=2): - if output_file == 'default': - output_file = 'groups.csv'.format(kv.db.name) - df_index = [ - 'groups', - 'species', - 'kvtag', - 'contig', - 'start', - 'end', - 'strand', - 'annotation', - 'dna_seq', - ] - df = pd.DataFrame() - group_no= 0 - groups_list = get_groups() - groups_list.sort(key=len, reverse=True) - - for group in groups_list: - if len(group) >= min_group_size: - group_no += 1 - # Entry is `(species, id)` - for entry in group: - db_handle = kv.get_mongo_record(*entry) - - annotation = db_handle['annotation'].replace(',','') - series = pd.Series( - [str(group_no).zfill(3), - db_handle['species'], - db_handle['kvtag'], - db_handle['location']['contig'], - db_handle['location']['start'], - db_handle['location']['end'], - db_handle['location']['strand'], - annotation, - db_handle['dna_seq'] - ], - index=df_index, - name=db_handle['kvtag'] - ) - df=df.append(series) - df.to_csv(output_file, columns=df_index) - -def output_groups_by_species(min_group_size=2): - all_species = kv.get_species_collections() - groups_list = get_groups() - groups_list.sort(key=len, reverse=True) - - groups_df = pd.DataFrame(data={n:0 for n in all_species}, index=[str(x+1) for x in range(0, len(groups_list))]) - - group_no = 0 - for group in groups_list: - if len(group) >= min_group_size: - group_no += 1 - species_in_group = [x[0] for x in group] - for species in species_in_group: - groups_df[species][group_no-1] = 1 - groups_df.to_csv('groups_by_species.csv') - -def output_compare_matrix(): - groups = get_groups() - all_species = get_tree()[1] - print all_species - - cds_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) - nt_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) - groups_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) - - for pair in combinations(all_species, 2): - print "====\nComparing {} and {}".format(pair[0], pair[1]) - shared_cds, shared_nt = pair_compare(pair[0],pair[1]) - shared_groups = 0 - if kv.get_genus(pair[0]) == kv.get_genus(pair[1]): - print "Oops! They're the same genus... moving on\n====" - continue - elif shared_cds == 0: - print "Oops! Looks like they don't share anything... 
moving on\n====" - continue - else: - for group in get_groups(): - if [x for x in group if x[0] == pair[0] and any(y[0] == pair[1] for y in group)]: - shared_groups +=1 - - cds_df[pair[0]][pair[1]] = cds_df[pair[1]][pair[0]] = shared_cds - nt_df[pair[0]][pair[1]] = nt_df[pair[1]][pair[0]] = shared_nt - groups_df[pair[0]][pair[1]] = groups_df[pair[1]][pair[0]] = shared_groups - print "shared cds: {}\nshared nt: {}\nshared groups: {}\n====".format(shared_cds, shared_nt, shared_groups) - - print nt_df - - cds_df.to_csv('cds_matrix.csv'.format(kv.db.name)) - nt_df.to_csv('nt_matrix.csv'.format(kv.db.name)) - groups_df.to_csv('groups_matrix.csv' .format(kv.db.name)) - -def pair_compare(species_1, species_2): - shared_CDS = 0 - shared_nt = 0 - - s1_genes = kv.get_collection('hits').find_one({'species':species_1}) - - for gene in s1_genes['hits']: - if s1_genes['hits'][gene]: - for hit in s1_genes['hits'][gene]: - if hit[0] == species_2: - shared_CDS += 1 - species_2_record = kv.get_mongo_record(hit[0],hit[1]) - hit_loc = species_2_record['location'] - shared_nt += hit_loc['end'] - hit_loc['start'] - return shared_CDS, shared_nt - -def get_islands(species_name): - islands = [] - species_hits_list = [] - - # Add mongo_record for each hit in any gene - all_hits = kv.get_collection('hits') - - - for query_id in species_hits: - if species_hits[query_id]: - species_hits_list.append( - kv.get_mongo_record(species_name, query_id) - ) - - for entry_1 in species_hits_list: - entry_recorded = False - for entry_2 in species_hits_list: - if entry_1 == entry_2: - pass - elif entry_1['location']['contig'] != entry_2['location']['contig']: - pass - else: - location_1 = entry_1['location'] - location_2 = entry_2['location'] - if abs(location_1['end'] - location_2['start']) <= 5000: - entry_recorded = True - islands.append([ - (entry_1['species'], str(entry_1['_id'])), - (entry_2['species'], str(entry_2['_id'])) - ]) - if not entry_recorded: - islands.append([(entry_1['species'], str(entry_1['_id']))]) - - return collapse_lists(islands) - -if __name__ == '__main__': - kv.mongo_init('reorg') - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/reorg/') - get_tree(True) \ No newline at end of file diff --git a/code/DeprecatedPhylo.py b/code/DeprecatedPhylo.py deleted file mode 100644 index da568e7..0000000 --- a/code/DeprecatedPhylo.py +++ /dev/null @@ -1,453 +0,0 @@ -# !/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import re -import os -import KvDataStructures as kv -import KvasirHGT as kh -from itertools import combinations, permutations -from matplotlib import pyplot as plt -from Bio import SeqIO -from Bio.Blast import NCBIXML -from bson.objectid import ObjectId -from subprocess import Popen, PIPE -import numpy as np -import pymongo -import plotly.plotly as py - -def get_16S_distance(species_1, species_2, pident=False): - # print 'Aligning:', species_1, species_2 - s1_ssu = str(kv.db['16S'].find_one({'species':species_1})['dna_seq']) - s2_ssu = str(kv.db['16S'].find_one({'species':species_2})['dna_seq']) - return get_gene_distance(s1_ssu, s2_ssu) - -def output_all_16S(): - print "Making fasta of all 16S in database {}".format(kv.db.name) - with open('{}_16S.fna'.format(kv.db.name), 'w+') as output_handle: - for record in kv.get_collection('16S').find(): - output_handle.write( - '>{0}\n{1}\n'.format( - record['species'], - record['dna_seq'], - ) - ) - -def 
get_gene_distance(seq_1, seq_2): - query = StripedSmithWaterman(seq_1) - alignment = query(seq_2) - q = DNA(alignment.aligned_query_sequence) - t = DNA(alignment.aligned_target_sequence) - return q.distance(t) - -def output_distance_matrix(core=False, to_file=True): - all_species = kv.get_collection('core').distinct('species')) - if core: - pass - else: - all_species.extend(kv.get_collection('other').distinct('species')) - - distance_matrix = pd.DataFrame(data={n:0.0 for n in all_species}, index=all_species) - - for pair in combinations_with_replacement(all_species, 2): - distance = get_16S_distance(pair[0], pair[1]) - distance_matrix[pair[0]][pair[1]] = distance - distance_matrix[pair[1]][pair[0]] = distance - - if to_file: - distance_matrix.to_csv('distance_matrix.csv') - else: - return distance_matrix - -def get_tree(core=False, newick=False): - all_species = kv.get_collection('core').distinct('species') - if core: - pass - else: - all_species.extend(kv.get_collection('other').distinct('species')) - t = tree.nj(dm) - print t.ascii_art() - tips = [] - for node in t.tips(): - print node.name, node.length - tips.append(node.name.replace(' ', '_')) - if newick: - n = tree.nj(dm, result_constructor=str) - print n - else: - return (t, tips) - -def ssu_perc_id(species_1, species_2): - s1_ssu = kv.db['16S'].find_one({'species':species_1}) - s2_ssu = kv.db['16S'].find_one({'species':species_2}) - - s1_fasta = make_gene_fasta(s1_ssu) - s2_fasta = make_gene_fasta(s2_ssu) - - out = Popen( - ['blastn', - '-query', s1_fasta, - '-subject', s2_fasta, - '-outfmt', '10 pident', - ], stdout=PIPE - ).communicate()[0] - if out: - result = re.split(r'[,\n]', out.rstrip())[0] - return result - -def make_species_fasta(species): - if not os.path.isdir('fastas'): - os.makedirs('fastas') - fasta = 'fastas/{}.fna'.format(species) - if not os.path.isfile(fasta): - with open(fasta, 'w+') as output_handle: - for record in kv.get_collection(species).find(): - output_handle.write( - ">{}|{}\n{}\n".format(record['species'].replace(' ', '_'), record['_id'], record['dna_seq']) - ) - return fasta - -def make_gene_fasta(gene_record): - if not os.path.isdir('fastas/genes'): - os.makedirs('fastas/genes') - fasta = 'fastas/genes/{}_{}.fna'.format(gene_record['species'], gene_record['kvtag']) - if not os.path.isfile(fasta): - with open(fasta, 'w+') as output_handle: - output_handle.write( - ">{}|{}\n{}\n".format(gene_record['species'].replace(' ', '_'), gene_record['_id'], gene_record['dna_seq']) - ) - return fasta - -def make_indexed_fasta(species): - if not os.path.isdir('fastas/'): - os.makedirs('fastas/') - id_list =[] - indexed_species = kv.index_contigs(kv.get_collection(species)) - indexed_species2 = kv.index_contigs(kv.get_collection(species)) - fasta = 'fastas/{}_indexed.fna'.format(species) - for record in indexed_species: - id_list.append(record['_id']) - if not os.path.isfile(fasta): - with open(fasta, 'w+') as output_handle: - for record in indexed_species2: - output_handle.write( - ">{}|{}\n{}\n".format(record['species'].replace(' ', '_'), record['_id'], record['dna_seq']) - ) - - return (fasta, id_list) - -def fasta_blast(species_1, species_2, word_size=28): - if not os.path.isdir('pairwise_blast'): - os.makedirs('pairwise_blast') - - s1 = make_species_fasta(species_1) - s2 = make_species_fasta(species_2) - results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2) - - if not os.path.isfile('pairwise_blast/{}_blastdb.nhr'.format(species_2)): - Popen( - ['makeblastdb', - '-in', s2, - 
'-dbtype', 'nucl', - '-out', 'pairwise_blast/{}_blastdb'.format(species_2), - '-title', os.path.basename(species_2), - ] - ).wait() - - if word_size > 7: - out = Popen( - ['blastn', - '-query', s1, - '-db', 'pairwise_blast/{}_blastdb'.format(species_2), - '-outfmt', '5', - '-word_size', str(word_size), - '-out', results, - ], - ).communicate()[0] - else: - print "Word size is too small, skipping" - return False - - blast_hits = count_records(results) - print "Got {} hits".format(blast_hits) - if blast_hits < 800: - if word_size > 7: - print "Not enough hits, trying with word size = {}".format(word_size-2) - os.remove(results) - return fasta_blast(species_1, species_2, word_size-2) - else: - print "Still not enough hits, but we're giving up trying to get to 800" - return results - elif blast_hits >= 800: - print "Got enough hits!" - return results - else: - print "something went wrong" - os.remove(results) - return False - -def output_loc_hist(species_1, species_2, ax): - s1, id_list = make_indexed_fasta(species_1) - s2 = make_species_fasta(species_2) - if not os.path.isfile('pairwise_blast/{}_blastdb.nhr'.format(species_2)): - Popen( - ['makeblastdb', - '-in', s2, - '-dbtype', 'nucl', - '-out', 'pairwise_blast/{}_blastdb'.format(species_2), - '-title', os.path.basename(species_2), - ] - ).wait() - - indexed_blast = blast_one(s1, 'pairwise_blast/{}_blastdb'.format(species_2)) - concatenated_subject = kv.concat_contigs(kv.get_collection(species_1)) - - xys = [] - last_end = 0 - - for i in range(len(indexed_blast))[0::4]: - # print indexed_blast[i:i+4] - - subject = concatenated_subject[ObjectId(kv.fasta_id_parse(indexed_blast[i])[1])] - query = kv.get_mongo_record(*kv.fasta_id_parse(indexed_blast[i+1])) - - x1 = subject['location']['start'] - if x1 <= last_end: - x1 = last_end + 1 - - x2 = subject['location']['end'] - last_end = x2 - - y = float(indexed_blast[i+2]) - print [(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)] - xys.extend([(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)]) - - xys.sort(key=lambda xy:xy[0]) - x, y = zip(*xys) - - if len(x) > 200: - cmap = plt.get_cmap('jet') - color = cmap(np.random.rand()) - - ax.plot(x, y, marker=None, color=color, linestyle='-', label='{}'.format(species_2)) - print "Plotting {} vs {}!".format(species_1, species_2) - - ssu_id = ssu_perc_id(species_1, species_2) - xmin, xmax = plt.xlim() - ax.plot([xmin, xmax], [ssu_id, ssu_id], color=color, label='{} 16S'.format(species_2)) - - else: - print "Not that many hits between {} and {}, skipping!".format(species_1, species_2) - - # plt.fill_between(x, 0, y, color='#B0384D', alpha=0.5) - -def blast_one(query, database, word_size=28): - out, err = Popen( - ['blastn', - '-query', query, - '-db', database, - '-num_alignments', '1', - '-outfmt', '10 qseqid sseqid pident length', - '-word_size', str(word_size), - ], stdout=PIPE - ).communicate() - if out: - result = re.split(r'[,\n]', out.rstrip()) - return result - else: - return None - -def all_by_all(species_1, species_2): - # results = fasta_blast(species_1, species_2) - results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2) - if results: - with open(results, 'r') as result_handle: - blast_records = NCBIXML.parse(result_handle) - hits_list = [] - for blast_record in blast_records: - qsp, qid = kv.fasta_id_parse(blast_record.query) - query_record = kv.get_mongo_record(qsp, qid) - for alignment in blast_record.alignments: - asp, aid = kv.fasta_id_parse(alignment.hit_def) - alignment_record = kv.get_mongo_record(asp, aid) - for hsp in 
alignment.hsps: - if hsp.align_length > 100: - pident = float(hsp.positives)/float(hsp.align_length) - length = hsp.align_length - hits_list.append((query_record, alignment_record)) - break - break - return hits_list - else: - print "Blast didn't work for some reason" - -def global_distance(species_1, species_2): - hits_list = all_by_all(species_1, species_2) - hits_distance = [] - ssu_distance = None - for hit in hits_list: - distance = kh.get_gene_distance(str(hit[0]['dna_seq']), str(hit[1]['dna_seq'])) - hits_distance.append(distance) - if hit[0]['type'] == '16S': - ssu_distance = distance - - return (ssu_distance, hits_distance) - -def plot_global_distance(list_of_species, sp_index): - focus = list_of_species[sp_index] - list_copy = list(list_of_species) - list_copy.remove(focus) - - ssus = [] - median_ys = [] - avg_ys = [] - plt.close() - - for sp in list_copy: - x_points = [] - y_points = [] - print sp - data = global_distance(focus, sp) - print data[0] - if data[0]: - for i in range(len(data[1])): - x_points.append(data[0]) - y_points.append(data[1][i]) - plt.scatter(x_points, y_points, marker='o', color='#B0384D') - ssus.append(data[0]) - median_ys.append(np.median(data[1])) - avg_ys.append(np.average(data[1])) - - plt.title(focus) - plt.plot([x for x in ssus], [y for y in ssus], marker='s', color='#599FA7') - plt.plot([x for x in ssus], [y for y in median_ys], '^m') - plt.xlabel("Species Distance") - plt.ylabel("Gene Distance") - plt.xlim(xmax=1.5, xmin=0) - plt.ylim(ymin=0) - - plt.savefig('{}_distances.pdf'.format(focus)) - -def pairwise_distance(species_1, species_2): - print "working on {} vs {}".format(species_1, species_2) - hits = all_by_all(species_1, species_2) - xy_points = [] - for hit in hits: - x = len(hit[0]['dna_seq']) - y = khget_gene_distance(str(hit[0]['dna_seq']), str(hit[1]['dna_seq'])) - xy_points.append((x,y)) - - - - x, y = zip(*xy_points) - y_average = np.average(y) - y_std = np.std(y) - plt.close() - plt.title("{} vs {}".format(species_1, species_2)) - plt.scatter(x, y, marker='o', color='#B0384D') - plt.axhline(y=y_average, color='#599FA7') - plt.axhline(y=y_average+y_std, color='#599FA7', ls=':') - plt.axhline(y=y_average-y_std, color='#599FA7', ls=':') - - plt.xlabel("Gene Length (bp)") - plt.ylabel("Gene Distance") - plt.xlim(xmin=0) - plt.ylim(ymin=0) - - plt.savefig('{}_{}_genedistance.pdf'.format(species_1, species_2)) - -def global_hist(list_of_species, sp_index): - focus = list_of_species[sp_index] - list_copy = list(list_of_species) - list_copy.remove(focus) - - plt.close() - - for sp in list_copy: - print sp - x_points = [] - data = global_distance(focus, sp) - print data[0] - if data[0] < 1.0: - data_list = [] - for i in range(len(data[1])): - data_list.append(data[1][i]) - x_points.append(data_list) - - plt.hist(x_points, histtype='bar', stacked=True) - - plt.title(focus) - plt.xlabel("Species Distance") - plt.ylabel("Gene Distance") - ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), - ncol=4, fancybox=True) - - plt.savefig('{}_hist.pdf'.format(focus)) - -def pairwise_hist(species_1, species_2): - print "working on {} vs {}".format(species_1, species_2) - hits = all_by_all(species_1, species_2) - xs = [] - for hit in hits: - x = khget_gene_distance(str(hit[0]['dna_seq']), str(hit[1]['dna_seq'])) - xs.append(x) - - plt.close() - plt.title("{} vs {}".format(species_1, species_2)) - plt.hist(xs, color='#B0384D',) - - plt.xlabel("Gene Distance") - plt.ylabel("frequency") - plt.xlim(xmin=0) - plt.ylim(ymin=0) - - 
plt.savefig('{}_{}_distance_hist.pdf'.format(species_1, species_2)) - -def count_records(blast_xml): - with open(blast_xml, 'r') as result_handle: - blast_records = NCBIXML.parse(result_handle) - counter = 0 - for i in blast_records: - try: - i.alignments[0] - counter +=1 - except IndexError: - pass - return counter - -def plot_many(list_of_species_pairs): - fig = plt.figure() - ypos = 1 - for pair in list_of_species_pairs: - if pair[0] == pairs[0][0]: - ax = fig.add_axes([1,ypos,1,1]) - output_loc_hist(pair[0], pair[1], ax) - - plot_url = py.plot_mpl(fig) - print plot_url - - # plt.xlabel("Position") - # plt.ylabel("percent identity") - # plt.savefig('/Users/KBLaptop/Desktop/try.pdf') - -if __name__ == '__main__': - kv.mongo_init('more_genomes') - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/more_genomes/') - ls = kv.get_species_collections() - print ls - ls.remove('Arthrobacter_arilaitensis_Re117') - pairs = [] - for pair in combinations(ls, 2): - pairs.append((pair[0], pair[1])) - plot_many(pairs) - - - # if os.path.isfile('{}_{}.pdf'.format(pair[0], pair[1])): - # continue - # try: - # output_loc_hist(pair[0], pair[1]) - # except RuntimeError: - # print "Couldn't compare {} and {}".format(pair[0], pair[1]) - - # test() \ No newline at end of file diff --git a/code/Deprecatedget_outputs.py b/code/Deprecatedget_outputs.py deleted file mode 100644 index 3e52d34..0000000 --- a/code/Deprecatedget_outputs.py +++ /dev/null @@ -1,23 +0,0 @@ - #!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os -import sys -import KvasirHGT as kh -from KvDataStructures import mongo_init - -def get_outputs(): - if not os.path.isdir('hgt_data/'): - os.makedirs('hgt_data/') - os.chdir('hgt_data/') - - kh.output_hits_csv() - kh.output_groups() - -# if __name__ == '__main__': -# import sys -# os.chdir('/Users/KBLaptop/computation/kvasir/data/output/dutton_genomes2/') -# mongo_init('dutton_genomes2') -# get_outputs() \ No newline at end of file diff --git a/code/FixGbk.py b/code/FixGbk.py deleted file mode 100644 index c5d44fc..0000000 --- a/code/FixGbk.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os, re -from Bio import SeqIO - -def confirm_species_name(file_name): - user_validate = str(raw_input("Is {} the correct species name? [y/n]".format(file_name[:-3]))) - if user_validate == 'y': - return file_name[:-3].replace(' ', '_').replace('.', '') - elif user_validate == 'n': - print "Please use only letters for the following" - genus = raw_input("Please enter organism genus: ") - species = raw_input("Please enter organism species (use \"sp\" if unknown):") - print "Please use only letters, numbers and '_' for the following" - strain = raw_input("Please enter organism strain (use \"undef\" if unknown):") - if re.match(r'^[A-Za-z]+$', genus) and re.match(r'^[A-Za-z]+$', species) and re.match(r'^\w+$', strain): - return '{}_{}_{}'.format(genus, species, strain) - else: - "Something's wrong, try again" - confirm_species_name(file_name) - else: - print "Please enter \"y\" or \"n\"... 
try again" - confirm_species_name(file_name) - -def validate_gbk(some_genbank, user_validate=True): - file_name = os.path.basename(some_genbank) - sp_name = None - if user_validate: - sp_name = confirm_species_name(file_name) - else: - sp_name = file_name[:-3].replace(' ', '_').replace('.', '') - - if sp_name: - new_file_name = 'validated_gbk/{0}_validated.gb'.format(sp_name) - - if not os.path.isfile(new_file_name): - with open(some_genbank, 'r') as open_file: - - lt_counter = 0 - contig_counter = 0 - genome = SeqIO.parse(open_file, 'gb') - new_genome = [] - - for record in genome: - new_genome.append(record) - new_genome.sort(key=len, reverse=True) - - for record in new_genome: - contig_counter += 1 - try: - source = record.annotations['source'] - except KeyError: - record.annotations['source'] = 'None' - record.annotations['comment'] = """Validated gb file for use with Kvasir HGT finder www.github.com/kescobo/kvasir - Original Def: {} - Original source: {} - """.format(record.name, record.annotations['source']) - record.name = 'kvc_{0}'.format(str(contig_counter).zfill(3)) - record.annotations['source'] = sp_name - for feature in record.features: - lt_counter += 1 - feature.qualifiers['kvtag'] = str(lt_counter).zfill(5) - - if not os.path.isdir('validated_gbk/'): - os.makedirs('validated_gbk/') - with open(new_file_name, 'w+') as output_handle: - SeqIO.write(new_genome, output_handle, 'gb') - return os.path.abspath(new_file_name) - else: - print 'already fixed, skipping validation!' - return False - -if __name__ == '__main__': - os.chdir('/Users/KBLaptop/computation/tmp') - validate_gbk('/Users/KBLaptop/computation/genomes/arthro_IMG.gb', False) \ No newline at end of file diff --git a/code/KvCircos.py b/code/KvCircos.py deleted file mode 100644 index dd1e021..0000000 --- a/code/KvCircos.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os -import re -from Bio import SeqIO, SeqUtils -from KvasirHGT import core_hgt_groups -import numpy as np -import KvDataStructures as kv -import pandas as pd - -def get_karyotype(some_gbk): - """ - Convert Genbank file into Karyotype file for Circos - - Each contig is a "chromosome" - - format: 'chr - ID LABEL START END COLOR' - """ - with open (some_gbk, 'r') as file_handle: - contigs = [] - sp_name = None - for record in SeqIO.parse(file_handle, 'gb'): - sp_name = kv.parse_genbank_name(some_gbk) - contigs.append((record.name, len(record))) - sp_strain = sp_name[2] - if not os.path.isdir('circos/karyotypes/'): - os.makedirs('circos/karyotypes/') - with open('circos/karyotypes/karyotype_{}.txt'.format(os.path.basename(some_gbk)[:-13]), 'w+') as karyotype: - color = [np.random.randint(0,255), np.random.randint(0,255), np.random.randint(0,255)] - for contig in contigs: - if contig[1] > 1000: - karyotype.write('chr - {0}{1} {2} {3} {4} {5},{6},{7}\n'.format( - sp_strain, - contig[0], - contig[0], - '1', - contig[1], - *color - ) - ) - else: - break - -def get_links(group=None, perc_identity='99'): - hits_collection = kv.get_collection('hits') - group_hits = None - if not os.path.isdir('circos/links/'): - os.makedirs('circos/links/') - out_name = 'circos/links/all_links_{}.txt'.format(perc_identity) - if group: - groups = core_hgt_groups() - group_hits = sorted(groups, key=len, reverse=True)[group - 1] - out_name = 'circos/links/group{}_links_{}.txt'.format(group, 
perc_identity) - - with open(out_name, 'w+') as out_handle: - for species in hits_collection.find(): - print species - try: - all_hits = species['core_hits_{}'.format(perc_identity)] - hits_to_write = None - if group: - hits_to_write = {gene:all_hits[gene] for gene in all_hits if (species['species'], gene) in group_hits} - else: - hits_to_write = all_hits - for gene in hits_to_write: - if hits_to_write[gene]: - s1_record = kv.get_mongo_record(species['species'], gene) - s1_strain = kv.parse_species_name(species['species']) - for hit in hits_to_write[gene]: - s2_record = kv.get_mongo_record(hit[0], hit[1]) - s2_strain = kv.parse_species_name(hit[0]) - out_handle.write('{0}kvc_{1} {2} {3} {4}kvc_{5} {6} {7}\n'.format( - s1_strain[2], - s1_record['location']['contig'], - s1_record['location']['start'], - s1_record['location']['end'], - s2_strain[2], - s2_record['location']['contig'], - s2_record['location']['start'], - s2_record['location']['end'], - ) - ) - except KeyError: - pass - - -def get_gc(some_gbk): - """ - Get GC content for contigs in genbank file, output file for circs line plot - - """ - with open (some_gbk, 'r') as file_handle: - gc_points = [] - sp_name = None - for record in SeqIO.parse(file_handle, 'gb'): - for b in range(len(record))[500::1000]: - gc_cont = SeqUtils.GC(record.seq[b-500:b+499]) - gc_points.append((record.name, b-500, b+499, gc_cont)) - - sp_name = kv.parse_genbank_name(some_gbk) - sp_strain = sp_name[2] - with open('circos/GC/gc_{}.txt'.format(os.path.basename(some_gbk)[:-13]), 'w+') as out_handle: - values = [x[3] for x in gc_points] - stats = (min(values), max(values), np.average(values), np.std(values)) - out_handle.write("# Min: {}\n# Max: {}\n# Avg, Std: {}, {}\n".format( - stats[0], stats[1], stats[2], stats[3] - ) - ) - for point in gc_points: - out_handle.write('{0}{1} {2} {3} {4}\n'.format( - sp_strain, - point[0], - point[1], - point[2], - point[3] - ) - ) - return stats - -def gc_circos(gb_folder): - if not os.path.isdir('circos/GC/'): - os.makedirs('circos/GC/') - with open('circos/GC/plots.conf', 'w+') as out_handle: - out_handle.write('\n') - for f in os.listdir(gb_folder): - if f.endswith('.gb'): - print f - stats = get_gc('{}{}'.format(gb_folder, f)) - out_handle.write(""" - show = yes - type = line - - file = GC/gc_{}.txt - r1 = 0.95r - r0 = 1.05r - max = {} - min = {} - - color = grey - thickness = 1 - - - - condition = var(value) > {} - color = dgreen - - - condition = var(value) < {} - color = dred - - - \n - """.format(f[:-13], stats[1], stats[0], stats[2]+stats[3], stats[2]-stats[3]) - ) - out_handle.write('\n') - -if __name__ == '__main__': - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/') - kv.mongo_init('pacbio2') - - # if not os.path.isdir('circos'): - # os.makedirs('circos') - # for f in os.listdir('validated_gbk/'): - # if f.endswith('.gb'): - # print f - # get_karyotype('validated_gbk/{}'.format(f)) - - get_links(perc_identity='95') - get_links(perc_identity='90') - - # gc_circos('validated_gbk/') - - # for entry in kv.get_collection('core').distinct('species'): - # print 'karyotypes/karyotype_{}.txt;'.format(entry) - - \ No newline at end of file diff --git a/code/KvDataStructures.py b/code/KvDataStructures.py deleted file mode 100644 index 9118a29..0000000 --- a/code/KvDataStructures.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) 
- -'''Functions for interacting with information from MongoDB''' - -from pymongo import MongoClient, ASCENDING, DESCENDING -from bson.objectid import ObjectId -import re - - -class mongo_iter(object): - """**DEPRECATED** Iterator that steps through species in mongoDB. Call with: - `for current_species_collection in mongo_iter(mongo_db_name):`""" - def __init__(self): - self.index = -1 - self.collections = get_species_collections() - - def __iter__(self): - return self - - def next(self): - if self.index == len(self.collections) - 1: - raise StopIteration - else: - self.index += 1 - return get_collection(self.collections[self.index]) - -client = MongoClient() -db = None - -def parse_genbank_name(some_genbank): - parsed = re.search(r"([A-Za-z]+)_([A-Za-z]+)_([\w-]+)_validated\.gb", some_genbank) - return (parsed.group(1), parsed.group(2), parsed.group(3)) - -def parse_species_name(sp_name): - parsed = re.search(r"^([A-Za-z]+)_([A-Za-z]+)_([\w-]+)$", sp_name) - return (parsed.group(1), parsed.group(2), parsed.group(3)) - -def get_gene_location(location): - ''' - Takes BioPython location object eg. `[1030:1460](-)` and extracts integer values. - Returns (start, stop, direction) - ''' - location_parse = re.search(r'\[?(\d+)\]\((\+|-)\)', str(location)) - if location_parse.group(3) == '+': - direction = 1 - elif location_parse.group(3) == '-': - direction = -1 - return (int(location_parse.group(1)), int(location_parse.group(2)), direction) - -def mongo_init(mongo_db_name): - global db - db = client[mongo_db_name] - return db.name - -def get_collections(): - return db.collection_names(False) - -def get_collection(collection): - return db[collection] - -def remove_collection(collection): - print get_collections() - db.drop_collection(collection) - -def reset_database(database): - mongo_init(database) - print "Now you see it:" - print db.collection_names(False) - for collection in db.collection_names(False): - db.drop_collection(collection) - print "Now you don't!" 
- print db.collection_names(False) - -def get_species_collections(): - collections = db.collection_names(False) - if '16S' in collections: - collections.remove('16S') - if 'hits' in collections: - collections.remove('hits') - return collections - -def get_mongo_record(species, mongo_id, db='core'): - return get_collection(db).find_one({'species':species, '_id':ObjectId(mongo_id)}) - -def get_genus(species_name): - return re.search(r'^(\w+)', species_name).group(1) - -def view_record(): - for item in get_species_collections(): - print item - counter = 1 - for collection in get_collection(item).find(): - counter += 1 - if counter > 3: - break - -def fasta_id_parse(fasta_id): - """"species_name|_id" -> (species_name, _id)""" - parsed = re.search(r'(\w+)\|(\w+)', fasta_id) - return (parsed.group(1), parsed.group(2)) - -def index_contigs(species, collection='core'): - curr_collection = get_collection(collection) - return curr_collectioncollection.find({'species':species}).sort([("location.contig", ASCENDING), ("location.start", ASCENDING)]) - -def concat_contigs(species, collection='core'): - current_contig = 0 - last_contig_end = 0 - last_gene_end = 0 - concatenated = {} - for gene in index_contigs(species, collection): - if gene['species'] == species: - if gene['location']['contig'] == current_contig: - gene['location']['start'] += last_contig_end - gene['location']['end'] += last_contig_end - last_gene_end = gene['location']['end'] - - concatenated[gene['_id']] = gene - else: - current_contig = gene['location']['contig'] - last_contig_end = last_gene_end - - gene['location']['start'] += last_contig_end - gene['location']['end'] += last_contig_end - last_gene_end = gene['location']['end'] - - concatenated[gene['_id']] = gene - - return concatenated - -def make_gene_fasta(gene_record, to_file=True): - entry = ">{}|{}\n{}\n".format(gene_record['species'].replace(' ', '_'), gene_record['_id'], gene_record['dna_seq']) - - if to_file: - fasta = '{}_{}.fna'.format(gene_record['species'], gene_record['kvtag']) - with open(fasta, 'w+') as output_handle: - output_handle.write(entry) - return fasta - else: - return entry - -def make_seq_fasta(seq, name='temp'): - fasta = '{}.fasta'.format(name) - with open(fasta, 'w+') as output_handle: - output_handle.write( - ">{}\n{}\n".format(name, seq) - ) - return fasta - -def make_id_list_fasta(id_list, db): - """ - Make fasta from list of `(species, _id)` tuples - """ - genes = [] - for gene_id in id_list: - genes.append(make_gene_fasta(get_mongo_record(*gene_id, db=db), to_file=False)) - with open('tmp.fna', 'w+') as output_handle: - for gene in genes: - output_handle.write(gene) - - - -if __name__ == '__main__': - test = 'validated_genbank/Arthrobacter_arilaitensis_3M03_validated.gb' - print parse_genbank_name(test) \ No newline at end of file diff --git a/code/KvasirBlast.py b/code/KvasirBlast.py deleted file mode 100644 index 95e263d..0000000 --- a/code/KvasirBlast.py +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -''' -Must have Mongod running, in terminal: `mongod --dbpath path/to/db` -''' - -import os, re -from subprocess import Popen, PIPE -from Bio.Blast import NCBIXML -from Bio.Blast.Applications import NcbiblastnCommandline -from bson.objectid import ObjectId -from pymongo.cursor import Cursor -from KvasirHGT import core_hgt_groups -import KvDataStructures as kv - -def 
make_blast_db(source, name=None, remove_source=True): - """ - Produces BLAST database from `source` - Optional - provide name (defaults to `source`) - Set remove_source=False to keep fasta file (if created) - - Source types: - - fasta file (use path, must end with `.fna`) - - Mongo collection (use name of collection) - - list of dicts containing at least keys `species`, `_id`, `dna_seq` - - Mongo cursor eg. `collection.find({'key':value})` - """ - # If there's no directory for blast db's, create one - if not os.path.isdir('blast_databases/'): - os.makedirs('blast_databases/') - - output_fasta = None - - if os.path.isfile(source): - # Input is fasta file? - if source.endswith('.fna'): - output_fasta = source - if not name: - name = os.path.basename(source)[:-4] - remove_source = False - else: - print "Not a valid file type, use .fna" - - else: - output_fasta = '{0}_all.fasta'.format(kv.db.name) - genes = None - with open(output_fasta, 'w+') as output_handle: - if source in kv.get_collections(): - genes = kv.get_collection(source).find() - if not name: - name = source - elif type(source) == list: - genes = source - elif type(source) == Cursor: - genes = source - - for gene in genes: - output_handle.write('>{0}|{1}\n{2}\n'.format( - gene['species'], - gene['_id'], - gene['dna_seq'], - ) - ) - - while not name: - name = str(raw_input("enter name for BLAST database: ")) - - # calls makeblastdb from shell - print "making a database!" - Popen( - ['makeblastdb', - '-in', output_fasta, - '-dbtype', 'nucl', - '-out', 'blast_databases/{0}'.format(name), - '-title', name, - ] - ).wait() # waits for this operation to terminate before moving on - - if remove_source: - os.remove(output_fasta) - -def blast_vs_fasta(query, subject): - """ - Blast `query` against `subject`. Both must be paths to fasta file - Returns list of lists, each `[sseqid, qseqid, pident, length]` - """ - out = Popen( - ['blastn', - '-query', query, - '-subject', subject, - '-outfmt', '10 sseqid qseqid pident length', - ], stdout=PIPE - ).communicate()[0] - if out: - result = re.split(r'[,\n]', out.rstrip())[0] - return [result[i:i+4] for i in range(len(result))[0::4]] - -def blast_vs_db(query, db): - """ - Blast `subject` (fasta file) against `db` (blast db). 
- Returns list of lists, each `[qseqid, sseqid, pident, length]` - """ - out = Popen( - ['blastn', - '-query', query, - '-db', db, - '-outfmt', '10 qseqid sseqid pident length', - ], stdout=PIPE - ).communicate()[0] - if out: - result = re.split(r'[,\n]', out.rstrip()) - return [result[i:i+4] for i in range(len(result))[0::4]] - -def core_hgt_blast(perc_identity='99'): - """ - Blasts all core genomes against core db - - Set `perc_identity` if desired (default = 99) - """ - if not os.path.isdir('blast_results/core/'): - os.makedirs('blast_results/core/') - for species in kv.get_collection('core').distinct('species'): - query_fasta = 'blast_results/core/{}_tmp.fna'.format(species) - - with open(query_fasta, 'w+') as query_handle: - for query in kv.get_collection('core').find({'species':species}): - if query['type'] == 'gene': - query_handle.write('>{0}|{1}\n{2}\n'.format( - query['species'], - query['_id'], - query['dna_seq'] - ) - ) - - print 'Blasting {0}'.format(species) - out = Popen( - ['blastn', - '-query', query_fasta, - '-db', 'blast_databases/core', - '-outfmt', '5', - '-out', 'blast_results/core/{}_{}_blast.xml'.format(species, perc_identity), - '-perc_identity', perc_identity - ], - stdout=PIPE - ).communicate()[0] - - os.remove(query_fasta) - -def blast_to_db(db='core', perc_identity='99'): - blast_dir = 'blast_results/{}/'.format(db) - for f in os.listdir(blast_dir): - if f.endswith('{}_blast.xml'.format(perc_identity)): - file_handle = 'blast_results/{}/{}'.format(db,f) - with open(file_handle, 'r') as result_handle: - blast_records = NCBIXML.parse(result_handle) - hits_dict = {} - for blast_record in blast_records: - query_parse = re.search(r'(\w+)\|(\w+)', blast_record.query) - query_genus_parse = re.match(r'([A-Za-z]+)_', blast_record.query) - query_genus = query_genus_parse.group(1) - query_name = query_parse.group(1) - query_id = query_parse.group(2) - - hits_dict[query_id] = [] - - for alignment in blast_record.alignments: - hit_parse = re.search(r'(\w+)\|(\w+)', alignment.hit_def) - hit_genus_parse = re.match(r'([A-Za-z]+)_', alignment.hit_def) - hit_genus = hit_genus_parse.group(1) - - hit_name = hit_parse.group(1) - hit_id = hit_parse.group(2) - if query_name == hit_name: - pass - elif query_genus == hit_genus: - print "Oops! 
{} and {} are the same genus, skipping...".format(query_name, hit_name) - pass - elif kv.get_mongo_record(hit_name, hit_id)['type'] == '16S': - print 'Skipping 16S hit' - else: - print '=======\nhit for {0} detected:\nspecies: {1}\n======='.format(query_name, hit_name) - hits_dict[query_id].append((hit_name, hit_id)) - - print 'Updataing mongoDB with hits' - hits_collection = kv.get_collection('hits') - hits_collection.update_one( - {'species':query_name}, - {'$set':{'{}_hits_{}'.format(db, perc_identity):{x:hits_dict[x] for x in hits_dict if hits_dict[x]}}}, - upsert=True - ) - -def hits_reset(): - kv.remove_collection('hits') - -def other_blast(): - groups_list = core_hgt_groups() - groups_list.sort(key=len, reverse=True) - - for i in range(len(groups_list)): - group_hits = [] - kv.make_id_list_fasta(groups_list[i], 'core') - results = blast_vs_db('tmp.fna', 'blast_databases/other') - - hits_collection = kv.get_collection('hits') - if results: - for j in range(len(results)): - group_hits.append(results[j]) - hits_collection.insert_one({'group':(i+1), 'group_hits':group_hits}) - - -if __name__ == '__main__': - import sys - kv.mongo_init('pacbio2') - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/') - # kv.mongo_init(sys.argv[1]) - # os.chdir('output/{}/'.format(sys.argv[1])) - make_blast_db('core') - make_blast_db('other') - hits_reset() - hgt_blast(perc_identity='90') - hgt_blast(perc_identity='95') - hgt_blast(perc_identity='99') - blast_to_db(perc_identity='90') - blast_to_db(perc_identity='95') - blast_to_db(perc_identity='99') - - other_blast() diff --git a/code/KvasirHGT.py b/code/KvasirHGT.py deleted file mode 100644 index 5caa730..0000000 --- a/code/KvasirHGT.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import KvDataStructures as kv -import pandas as pd - -def core_hgt_stats(perc_identity='99'): - """ - Returns stats of HGT (number of events etc) - """ - collection = kv.get_collection('core') - df_index = ['Total_CDS', 'HGT_CDS', 'Islands'] - df = pd.DataFrame() - for species in collection.distinct('species'): - hits = kv.get_collection('hits').find_one({'species':species})['core_hits_{}'.format(perc_identity)] - series = pd.Series([ - sum([1 for x in collection.find({'species':species})]), - sum([1 for x in hits if hits[x]]), - len(get_islands(species)) - ], name=species, index=df_index) - df = df.append(series) - - df.to_csv('stats.csv', columns=df_index) - -def collapse_lists(list_of_lists): - """ - Reduces list of lists by combining lists with identical elements - **Example input**: [[1,2,3],[3,4],[5,6,7],[1,8,9,10],[11],[11,12],[13],[5,12]] - **Example output: [[1,2,3,4,8,9,10],[5,6,7,11,12],[13]] - - from stackoverflow user YXD: http://stackoverflow.com/questions/30917226/collapse-list-of-lists-to-eliminate-redundancy - """ - result = [] - for l in list_of_lists: - s = set(l) - - matched = [s] - unmatched = [] - # first divide into matching and non-matching groups - for g in result: - if s & g: - matched.append(g) - else: - unmatched.append(g) - # then merge matching groups - result = unmatched + [set().union(*matched)] - return result - -def get_islands(species_name, perc_identity='99'): - """ - For each species, combines HGT hits co-occurring within 5kb of eachother - Returns list of lists of `(species, _id)` tuples - """ - - islands = [] - species_hits_list = [] - - # 
Add mongo_record for each hit in any gene - all_hits = kv.get_collection('hits') - species_hits = all_hits.find_one({'species':species_name})['core_hits_{}'.format(perc_identity)] - - - for query_id in species_hits: - if species_hits[query_id]: - species_hits_list.append( - kv.get_mongo_record(species_name, query_id) - ) - - for entry_1 in species_hits_list: - entry_recorded = False - for entry_2 in species_hits_list: - if entry_1 == entry_2: - pass - elif entry_1['location']['contig'] != entry_2['location']['contig']: - pass - else: - location_1 = entry_1['location'] - location_2 = entry_2['location'] - if abs(location_1['end'] - location_2['start']) <= 5000: - entry_recorded = True - islands.append([ - (entry_1['species'], str(entry_1['_id'])), - (entry_2['species'], str(entry_2[ '_id'])) - ]) - if not entry_recorded: - islands.append([(entry_1['species'], str(entry_1['_id']))]) - - return collapse_lists(islands) - -def core_hgt_groups(perc_identity='99'): - """ - Returns mutilspecies groups of genes as list of lists. - - - Starts with species islands `get_islands()` - - For each island, if a hit from that island is in another island, - group them together - """ - all_hits = kv.get_collection('hits') - groups_list = [] - for s in all_hits.distinct('species'): - s_hits = all_hits.find_one({'species':s}) - current_species_islands = get_islands(s_hits['species']) - - # each sublist represents one island... - for island in current_species_islands: - if island: # many lists are empty, skip those - hit_set = set() # container for hits - for gene_id in island: - gene_hits = s_hits['core_hits_{}'.format(perc_identity)][gene_id[1]] - - # Pulls each hit id tuple, then appends it to group_set - for hit in gene_hits: - hit_set.add((hit[0], hit[1])) - # add id tuples for hits to island list... - island.update(hit_set) - # And add new island (with multiple species) to groups_list - groups_list.append(list(island)) - - # Since each species' islands are built independently, there's a lot of redundancy - # So... 
Collapse lists that contain shared elements and deduplicate - return map(list, collapse_lists(groups_list)) - -def output_groups(min_group_size=2): - """ - Returns .csv file with information for each CDS in an HGT group - - - Optional: set minimum number of CDS to be considered a group - """ - output_file = '{}_groups.csv'.format(kv.db.name) - df_index = ['group','kvtag','contig','start','end','strand','annotation','dna_seq'] - df = pd.DataFrame() - group_no= 0 - groups_list = core_hgt_groups() - groups_list.sort(key=len, reverse=True) - - for group in groups_list: - if len(group) >= min_group_size: - group.sort(key=lambda entry:entry[0]) - group_no += 1 - for entry in group: # Entry is `(species, id)` - - db_handle = kv.get_mongo_record(*entry) - annotation = db_handle['annotation'].replace(',','') # prevents CSV screw-up - series = pd.Series( - [str(group_no).zfill(3), - db_handle['kvtag'], - db_handle['location']['contig'], - db_handle['location']['start'], - db_handle['location']['end'], - db_handle['location']['strand'], - annotation, - db_handle['dna_seq'] - ], - index=df_index, - name=db_handle['species'] - ) - df=df.append(series) - df.to_csv(output_file, columns=df_index) - -def group_hits(core=False): - all_species = kv.get_collection('core').distinct('species') - if not core: - all_species.extend(kv.get_collection('other').distinct('species')) - - - hits_db = kv.get_collection('hits') - species_index = sorted(all_species) - print species_index - df = pd.DataFrame() - core_groups = sorted(core_hgt_groups(), key=len, reverse=True) - - - for group in sorted(hits_db.distinct('group')): - recorded = [] - s = {sp:0.0 for sp in species_index} - for hit in core_groups[group-1]: - if not hit in recorded: - s[hit[0]] += len(kv.get_mongo_record(*hit)['dna_seq']) - recorded.append(hit) - - for hit in hits_db.find_one({'group':group})['group_hits']: - if float(hit[2]) > 90 and float(hit[3]) > 100: - if hit[1] not in recorded: - s[kv.fasta_id_parse(hit[1])[0]] += float(hit[2])*float(hit[3])/100 - recorded.append(hit[1]) - - s = pd.Series(s, name='group_{}'.format(group)) - df['group_{}'.format(group)] = s - - df.to_csv('group_hits_other.csv') - -# if __name__ == '__main__': -# import os -# kv.mongo_init('pacbio2') -# os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/') -# # group_hits(core=True) -# # output_groups() -# # core_hgt_stats() -# output_hits_csv() \ No newline at end of file diff --git a/code/KvasirPhylo.py b/code/KvasirPhylo.py deleted file mode 100644 index d4b6ef0..0000000 --- a/code/KvasirPhylo.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import os - -import KvDataStructures as kv -import pandas as pd - -from itertools import combinations_with_replacement -from subprocess import Popen, PIPE -from skbio import DNA -from skbio import DistanceMatrix -from skbio import tree -from skbio.alignment import StripedSmithWaterman - -def get_gene_distance(seq_1, seq_2): - """ - Returns hamming distance between two DNA sequences - Alignment based on Striped Smith-Waterman algorithm - """ - query = StripedSmithWaterman(seq_1.upper()) - alignment = query(seq_2.upper()) - q = DNA(alignment.aligned_query_sequence) - t = DNA(alignment.aligned_target_sequence) - return q.distance(t) - -def get_perc_id(seq_1, seq_2): - return 1.0 - get_gene_distance(seq_1, seq_2) - -def get_16S_distance(species_1, 
species_2): - """ - Returns hamming distance of the 16S seq of two species in MongoDB - """ - if species_1 == species_2: - return 0.0 - else: - s1_ssu = kv.db['16S'].find_one({'species':species_1}) - s2_ssu = kv.db['16S'].find_one({'species':species_2}) - if s1_ssu and s2_ssu: - if len(s1_ssu['dna_seq']) > 800 and len(s1_ssu['dna_seq']) > 800: - return get_gene_distance(str(s1_ssu['dna_seq']), str(s2_ssu['dna_seq'])) - else: - return None - else: - return None - -def get_distance_matrix(core=False, to_file=True): - all_species = kv.get_collection('core').distinct('species') - if core: - pass - else: - all_species.extend(kv.get_collection('other').distinct('species')) - - ssu_species = [n for n in all_species if kv.db['16S'].find_one({'species':n})] - distance_matrix = pd.DataFrame(data={n:0.0 for n in ssu_species}, index=ssu_species, columns=ssu_species) - - for pair in combinations_with_replacement(ssu_species, 2): - distance = get_16S_distance(pair[0], pair[1]) - if distance: - distance_matrix[pair[0]][pair[1]] = distance - distance_matrix[pair[1]][pair[0]] = distance - - if to_file: - distance_matrix.to_csv('distance_matrix.csv') - else: - return distance_matrix - -def get_tree(core=False, newick=False): - core_collection = kv.get_collection('core') - all_species = core_collection.distinct('species') - if core: - pass - else: - other_collection = kv.get_collection('other') - all_species.extend(other_collection.distinct('species')) - ssu_species = [n for n in all_species if kv.db['16S'].find_one({'species':n})] - - dm = DistanceMatrix(get_distance_matrix(core=core, to_file=False), ssu_species) - t = tree.nj(dm) - print t.ascii_art() - tips = [] - for node in t.tips(): - print node.name, node.length - tips.append(node.name.replace(' ', '_')) - if newick: - n = tree.nj(dm, result_constructor=str) - print n - else: - return (t, tips) - -def ssu_fasta(): - with open('16s.fna', 'w+') as out_handle: - for species in kv.get_collection('16S').distinct('species'): - ssu = kv.get_collection('16S').find_one({'species':species}) - if ssu: - out_handle.write(kv.make_gene_fasta(ssu, to_file=False)) - else: - print species - -def add_ssu(supp_file): - # df = pd.read_csv(supp_file) - # print df.columns - # new_df = pd.DataFrame() - # # for i in range(len(df['Strain'])): - # # print df['Strain'][i].replace(' ', '_').replace('.', '') - - # strain = pd.Series([df['Strain'][i].replace(' ', '_').replace('.', '') for i in range(len(df['Strain']))], name='strain') - # ssus = df['sequences of the 16s rRNA genes'] - - # ssu = pd.Series([ssus[i].replace(r'\n', '') if not pd.isnull(ssus[i]) else None for i in range(len(ssus))], name='16S') - # new_df['strain'] = strain - # new_df['16S'] = ssu - - # new_df.to_csv('ssu.csv') - - ssu_df = pd.read_csv(supp_file) - for i in range(len(ssu_df['strain'])): - print ssu_df['strain'][i], ssu_df['16S'][i] - if not pd.isnull(ssu_df['16S'][i]): - gene_record = { - 'species':ssu_df['strain'][i], - 'location':{ - 'contig':None, - 'start':None, - 'end':None, - 'strand':None, - }, - 'annotation':'Small subunit ribosomal RNA', - 'dna_seq':ssu_df['16S'][i], - 'kvtag':None, - 'type':'16S' - } - - print "adding 16S gene!" 
- kv.get_collection('16S').remove({'species':ssu_df['strain'][i]}) - print kv.get_collection('16S').find_one({'species':ssu_df['strain'][i]}) - kv.get_collection('16S').insert_one(gene_record) - print kv.get_collection('16S').find_one({'species':ssu_df['strain'][i]}) - -def get_list_from_tree(newick_file): - pass - - -if __name__ == '__main__': - os.chdir('/Users/KBLaptop/computation/kvasir/data/output/reorg/') - kv.mongo_init('reorg') diff --git a/code/reset_db.py b/code/reset_db.py deleted file mode 100644 index 1177214..0000000 --- a/code/reset_db.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -import sys -import KvDataStructures as kv - -kv.mongo_init(sys.argv[1]) -kv.reset_database(kv.db.name) \ No newline at end of file diff --git a/code/run_kvasir.py b/code/run_kvasir.py deleted file mode 100644 index 9ad72a8..0000000 --- a/code/run_kvasir.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python -# by Kevin Bonham, PhD (2015) -# for Dutton Lab, Harvard Center for Systems Biology, Cambridge MA -# Unless otherwise indicated, licensed under GNU Public License (GPLv3) - -''' -- Create directories `path/input/` and `path/output/` -- Put genbank files into `path/input/core` or `path/input/other` -- Must have Mongod running, in terminal: `mongod --dbpath path/to/db` -- run `python run_kvasir.py [run_name]` from `path/` -''' - -import os -import sys -import datetime -import DataImport, FixGbk, KvasirBlast -from KvasirHGT import output_groups -from KvDataStructures import mongo_init, reset_database - -print 'Here we go!' - -core_gbk_folder = os.path.abspath('data/input/core/') -other_gbk_folder =os.path.abspath('data/input/other/') -exp_name = datetime.datetime.now().strftime('%Y%m%d%H%M') - -mongo_init(exp_name) -reset_database(exp_name) - -new_folder = 'data/output/{0}/'.format(exp_name) -if not os.path.isdir(new_folder): - os.makedirs(new_folder) -os.chdir(new_folder) - -print 'Checking and importing core genomes...' - -for the_file in os.listdir(core_gbk_folder): - path_to_file = '{0}/{1}'.format(core_gbk_folder, the_file) - DataImport.import_file(path_to_file, 'core') - -print 'Checking and importing other genomes...' 
-
-for the_file in os.listdir(other_gbk_folder):
-    path_to_file = '{0}/{1}'.format(other_gbk_folder, the_file)
-    DataImport.import_file(path_to_file, 'other')
-
-
-KvasirBlast.make_blast_db('core')
-KvasirBlast.make_blast_db('other')
-KvasirBlast.core_hgt_blast(perc_identity='90')
-# KvasirBlast.core_hgt_blast(perc_identity='95')
-# KvasirBlast.core_hgt_blast(perc_identity='99')
-KvasirBlast.blast_to_db(perc_identity='90')
-# KvasirBlast.blast_to_db(perc_identity='95')
-# KvasirBlast.blast_to_db(perc_identity='99')
-output_groups()
-
diff --git a/requirements.txt b/requirements.txt
index 365f886..2cf807d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 pymongo
-Biopython
\ No newline at end of file
+biopython
diff --git a/run.py b/run.py
index 58ba82f..3c36619 100644
--- a/run.py
+++ b/run.py
@@ -1,4 +1,6 @@
 from settings import *
+import sys
+sys.path.append('src/')
 from DataImport.mongo_import import mongo_import_genbank
 from FindHGT.make_blast_db import make_blast_db, db_cds_to_fna
 from FindHGT.run_blast import blast_all, parse_blast_results_xml
@@ -18,36 +20,35 @@ def blast_db():
     if not os.path.isdir(db_path):
         os.makedirs(db_path)
 
-    make_blast_db(fasta.name, "nucl", os.path.join(db_path, "genes")) # Collection name option? (see ln9 above)
+    make_blast_db(fasta.name, "nucl", os.path.join(db_path, "genes")) # Collection name option? (see ln10 above)
 
 def blast():
-    fasta = db_cds_to_fna('genes') # Collection name option? (see ln9 above)
+    fasta = db_cds_to_fna('genes') # Collection name option? (see ln10 above)
     db_path = os.path.join(OUTPUT, MONGODB.name, "blast_db", "genes")
 
     blast_results = blast_all(fasta, db_path)
     parse_blast_results_xml(blast_results)
 
-def analyze():
-    groups = output.hgt_groups(0.99, minimum_length=500, dist_between_hits=5000)
-    output.output_groups(groups, os.path.join(OUTPUT, "99-500-5000-groups-test.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=400, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-400-5000-groups.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=400, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-00-5000-groups.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=300, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-300-5000-groups.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=400, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-400-5000-groups.csv"))
-    # groups = output.hgt_groups(0.99, minimum_length=400, dist_between_hits=5000)
-    # output.output_groups(groups, os.path.join(OUTPUT, "99-400-5000-groups.csv"))
-
+def analyze(minimum_identity, minimum_length=500, dist_between_hits=5000):
+    groups = output.hgt_groups(minimum_identity, minimum_length, dist_between_hits)
+    output.output_groups(
+        groups, os.path.join(
+            OUTPUT, "{}-{}-{}-groups.csv".format(
+                int(round(minimum_identity * 100)), minimum_length, dist_between_hits
+            )
+        )
+    )
+"""
+WIP
+"""
 def run_circos():
+    pass
     # circos.get_karyotypes()
-    circos.get_links(0.99, 500, group_no=2, min_dist=3000, annotation="transposase")
-    circos.get_links(0.99, 500, group_no=1, min_dist=3000, annotation="transposase")
+    # circos.get_links(0.99, 500, group_no=2, min_dist=3000, annotation="transposase")
+    # circos.get_links(0.99, 500, group_no=1, min_dist=3000, annotation="transposase")
     # circos.get_links(0.90, 400, 0.95)
     # circos.get_gc_conf()
     # circos.get_conf_file("path/to/output/database_name/circos/links/99-400-links.txt",
diff --git a/settings.py b/settings.py
index 0ab289c..772f0a7 100644
--- a/settings.py
+++ b/settings.py
@@ -1,5 +1,5 @@
 import pymongo
 
 INPUT = "/path/to/input/" # path to folder with genbank files
-OUTPUT = "/Users/KBLaptop/computation/kv_data/output/database_name/" # path to output folder
+OUTPUT = "/path/to/output/" # path to output folder
 MONGODB = pymongo.MongoClient()["database_name"]
diff --git a/src/Analysis/output.py b/src/Analysis/output.py
index 6fe3d7b..a53e234 100644
--- a/src/Analysis/output.py
+++ b/src/Analysis/output.py
@@ -107,11 +107,11 @@ def find_all_hits(some_id, minimum_identity, minimum_length=100):
     as_query = {"type": "blast_result", "query": str(some_id)} # maybe should use ObjectId on import, see issue#10
     as_subject = {"type": "blast_result", "subject": str(some_id)}
 
-    blast_hits = collection.find({"$or": [as_query, as_subject],
-                                  "perc_identity": {"$gte": minimum_identity},
-                                  "length": {"$gte": minimum_length}
-                                  }
-                                  ) # will evaluate None if no pair is found
+    blast_hits = collection.find({"$or": [as_query, as_subject],
+                                  "perc_identity": {"$gte": minimum_identity},
+                                  "length": {"$gte": minimum_length}
+                                  }
+                                 ) # NB: returns a (possibly empty) Cursor, never None
 
     # we only need to get the hit back
     if blast_hits:
diff --git a/test.txt b/test.txt
deleted file mode 100644
index e69de29..0000000
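For reference, the reworked `analyze()` exposes the grouping thresholds as arguments instead of hard-coding them. A short interactive sketch of the new signature (assumes a running `mongod` and a database already populated via `import_data()` and `blast_db()`; the threshold values below are illustrative, not recommendations):

```
>>> import run
>>> run.analyze(0.95, minimum_length=300, dist_between_hits=3000)
```

This writes `95-300-3000-groups.csv` into the `OUTPUT` folder configured in `settings.py`.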
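The query in `find_all_hits()` retrieves a gene's BLAST hits whether the gene was recorded as the query or the subject, filtering on identity and alignment length in the same call. A minimal standalone sketch of that pattern (the database name, collection name, and gene id here are hypothetical; `find()` yields a Cursor, which is simply empty when nothing matches):

```
from pymongo import MongoClient

# hypothetical database/collection names and gene id
collection = MongoClient()["database_name"]["collection"]
some_id = "example_gene_id"  # ids are stored as strings on import (see issue#10)

blast_hits = collection.find({
    "$or": [{"type": "blast_result", "query": some_id},
            {"type": "blast_result", "subject": some_id}],
    "perc_identity": {"$gte": 0.99},  # identity as a fraction, as in run.analyze(0.99)
    "length": {"$gte": 500}           # minimum alignment length
})

for hit in blast_hits:  # iterating an empty Cursor is a no-op
    print hit["query"], hit["subject"], hit["perc_identity"]
```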