updates to v0.3.2

nextgenusfs · Jul 1, 2016 · bfea9f0 · bfea9f0
1 parent f5a89d7
commit bfea9f0
Show file tree

Hide file tree

Showing 8 changed files with 300 additions and 50 deletions.
diff --git a/README.md b/README.md
@@ -32,7 +32,7 @@ To see the help menu, simply type `funannotate` in the terminal window.  Similar
 $  funannotate
 
 Usage:       funannotate <command> <arguments>
-version:     0.1.4
+version:     0.3.2
 
 Description: Funannotate is a genome prediction, annotation, and comparison pipeline.
     

diff --git a/bin/funannotate-predict.py b/bin/funannotate-predict.py
diff --git a/docs/ubuntu_install.md b/docs/ubuntu_install.md
@@ -43,6 +43,7 @@ brew doctor
 brew tap homebrew/dupes
 brew tap homebrew/science
 brew tap nextgenusfs/tap
+brew update
 ```
 
 4) Install funannotate and dependencies using LinuxBrew

diff --git a/funannotate.py b/funannotate.py
@@ -129,7 +129,8 @@ def fmtcols(mylist, cols):
            --genemark_mod         GeneMark ini mod file.
            --protein_evidence     Proteins to map to genome (prot1.fa,prot2.fa,uniprot.fa). Default: uniprot.fa
            --transcript_evidence  mRNA/ESTs to align to genome (trans1.fa,ests.fa,trinity.fa). Default: none
-           --busco_seed_species   Augustus pre-trained species to start BUSCO. Default: generic
+           --busco_seed_species   Augustus pre-trained species to start BUSCO. Default: anidulans
+           --optimize_augustus    Run 'optimze_augustus.pl' to refine training (long runtime)
            --busco_db             BUSCO models. Default: fungi [fungi,vertebrata,metazoa,eukaryota,arthropoda]
            --organism             Fungal-specific options. Default: fungus. [fungus,other] 
            

diff --git a/lib/library.py b/lib/library.py
@@ -1,5 +1,5 @@
 from __future__ import division
-import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing
+import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing, itertools
 from natsort import natsorted
 import warnings
 from Bio import SeqIO
@@ -74,6 +74,9 @@ def readBlocks(source, pattern):
             buffer.append( line )
     yield buffer
 
+def empty_line_sep(line):
+    return line=='\n'
+
 def get_parent_dir(directory):
     return os.path.dirname(directory)
 
@@ -1732,31 +1735,35 @@ def getTrainResults(input):
                 values3 = line.split('|') #get [6] and [7]
         return (values1[1], values1[2], values2[6], values2[7], values3[6], values3[7])
 
-def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus):
+def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus, optimize):
     RANDOMSPLIT = os.path.join(AUGUSTUS_BASE, 'scripts', 'randomSplit.pl')
     OPTIMIZE = os.path.join(AUGUSTUS_BASE, 'scripts', 'optimize_augustus.pl')
+    NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')
     aug_cpus = '--cpus='+str(cpus)
     species = '--species='+train_species
     aug_log = os.path.join(outdir, 'logfiles', 'augustus_training.log')
     trainingdir = 'tmp_opt_'+train_species
     with open(aug_log, 'w') as logfile:
-        subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 100 models for testing purposes
-        if not CheckAugustusSpecies(train_species): #check if training set exists, if not run etraining
-            subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
+        if not CheckAugustusSpecies(train_species):
+            subprocess.call([NEW_SPECIES, species], stdout = logfile, stderr = logfile)
+        #run etraining again to only use best models from EVM for training
+        subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
+        subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 200 models for testing purposes
         with open(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'), 'w') as initialtraining:
             subprocess.call(['augustus', species, trainingset+'.test'], stdout=initialtraining)
         train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'))
         log.info('Initial training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
-        #now run optimization
-        subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile)
-        #run etraining again
-        subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
-        with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining:
-            subprocess.call(['augustus', species, os.path.join(trainingdir, 'bucket1.gb')], stdout=finaltraining)
-        train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'))
-        log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
-        #clean up tmp folder
-        shutil.rmtree(trainingdir)
+        if optimize:
+            #now run optimization
+            subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile)
+            #run etraining again
+            subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
+            with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining:
+                subprocess.call(['augustus', species, trainingset+'.test'], stdout=finaltraining)
+            train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'))
+            log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
+            #clean up tmp folder
+            shutil.rmtree(trainingdir)
 
 HEADER = '''
 <!DOCTYPE html>

diff --git a/util/filter_buscos.py b/util/filter_buscos.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+#script to reformat Augustus BUSCO results
+import sys, os, itertools
+
+if len(sys.argv) < 4:
+    print("Usage: filter_buscos.py busco.evm.gff3 full_table_species busco.final.gff3")
+    sys.exit(1)
+
+def group_separator(line):
+    return line=='\n'
+
+#parse the busco table into dictionary format
+busco_complete = {}
+with open(sys.argv[2], 'rU') as buscoinput:
+    for line in buscoinput:
+        cols = line.split('\t')
+        if cols[1] == 'Complete':
+            ID = cols[2].replace('evm.model.', '')
+            if not ID in busco_complete:
+                busco_complete[ID] = (cols[0], cols[3])
+            else:
+                score = busco_complete.get(ID)[1]
+                if float(cols[3]) > float(score):
+                    busco_complete[ID] = (cols[0], cols[3])
+                    print ID, 'updated dictionary'
+                else:
+                    print ID, 'is repeated and score is less'
+
+#now parse the evm busco file, group them
+results = []
+with open(sys.argv[1]) as f:
+    for key, group in itertools.groupby(f, group_separator):
+        if not key:
+            results.append(list(group))
+
+#loop through each gene model, lookup the BUSCO name, and then replace the name with counter based and busco model name
+'''
+scaffold_1	EVM	gene	18407	18947	.	-	.	ID=evm.TU.scaffold_1.1;Name=EVM%20prediction%20scaffold_1.1
+scaffold_1	EVM	mRNA	18407	18947	.	-	.	ID=evm.model.scaffold_1.1;Parent=evm.TU.scaffold_1.1;Name=EVM%20prediction%20scaffold_1.1
+scaffold_1	EVM	exon	18772	18947	.	-	.	ID=evm.model.scaffold_1.1.exon1;Parent=evm.model.scaffold_1.1
+scaffold_1	EVM	CDS	18772	18947	.	-	0	ID=cds.evm.model.scaffold_1.1;Parent=evm.model.scaffold_1.1
+scaffold_1	EVM	exon	18407	18615	.	-	.	ID=evm.model.scaffold_1.1.exon2;Parent=evm.model.scaffold_1.1
+scaffold_1	EVM	CDS	18407	18615	.	-	1	ID=cds.evm.model.scaffold_1.1;Parent=evm.model.scaffold_1.1
+'''
+counter = 0        
+with open(sys.argv[3], 'w') as output:
+    for i in results:
+        counter += 1
+        cols = i[0].split('\t')
+        ID = cols[8].split(';')[0]
+        ID = ID.replace('ID=', '')
+        lookup = ID.replace('evm.TU.', '')
+        if lookup in busco_complete:
+            name = busco_complete.get(lookup)[0]
+            geneID = 'gene'+str(counter)
+            mrnaID = 'mrna'+str(counter)
+            newblock = ''.join(i)
+            newblock = newblock.replace('EVM%20prediction%20'+lookup, name)
+            newblock = newblock.replace(ID, geneID)
+            newblock = newblock.replace('evm.model.'+lookup, mrnaID)
+            output.write(newblock+'\n')
diff --git a/util/fix_busco_naming.py b/util/fix_busco_naming.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+#script to reformat Augustus BUSCO results
+import sys, os, itertools
+
+if len(sys.argv) < 4:
+    print("Usage: fix_busco_naming.py busco_augustus.gff3 full_table_species busco_augustus.fixed.gff3")
+    sys.exit(1)
+
+def group_separator(line):
+    return line=='\n'
+
+#parse the busco table into dictionary format
+busco_complete = {}
+with open(sys.argv[2], 'rU') as buscoinput:
+    for line in buscoinput:
+        cols = line.split('\t')
+        if cols[1] == 'Complete':
+            if not cols[0] in busco_complete:
+                busco_complete[cols[0]] = cols[2]+':'+cols[3]+'-'+cols[4]
+
+#now parse the augustus input file where gene numbers are likely repeated.
+results = []
+with open(sys.argv[1]) as f:
+    for key, group in itertools.groupby(f, group_separator):
+        if not key:
+            results.append(list(group))
+
+#loop through each gene model, lookup the BUSCO name, and then replace the name with counter based and busco model name
+counter = 0        
+inverse_busco = {v: k for k, v in busco_complete.items()}
+with open(sys.argv[3], 'w') as output:
+    for i in results:
+        counter += 1
+        cols = i[0].split('\t')
+        lookup = cols[0]+':'+cols[3]+'-'+cols[4]
+        if lookup in inverse_busco:
+            name = inverse_busco.get(lookup)
+        else:
+            name = 'unknown_model'
+        ID = cols[8].split(';')[0]
+        ID = ID.replace('ID=', '')
+        newID = 'gene'+str(counter)
+        newblock = ''.join(i)
+        newblock = newblock.replace('Augustus%20prediction', name)
+        newblock = newblock.replace(ID, newID)
+        output.write(newblock+'\n')
diff --git a/util/funannotate-BUSCO.py b/util/funannotate-BUSCO.py
@@ -1089,7 +1089,7 @@ def measuring (nested):
                 scaff = seq_id.strip(']').split('[')[1].split(':')[0]
                 scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
                 scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]
-
+                scaff_end = scaff_end.replace(']', '')       
                 out.write('%s\tComplete\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))
                 csc[entity] = seq_id
 
@@ -1106,6 +1106,7 @@ def measuring (nested):
                 scaff = seq_id.strip(']').split('[')[1].split(':')[0]
                 scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
                 scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]
+                scaff_end = scaff_end.replace(']', '')
                 out.write('%s\tDuplicated\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))
 
 
@@ -1122,6 +1123,7 @@ def measuring (nested):
                     scaff = seq_id.strip(']').split('[')[1].split(':')[0]
                     scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
                     scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]
+                    scaff_end = scaff_end.replace(']', '')
                     out.write('%s\tFragmented\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))
 
     missing = []
@@ -1227,7 +1229,7 @@ def measuring (nested):
             gff_file.write(line)
     gff_file.close()
 
-    subprocess.call('$AUGUSTUS_CONFIG_PATH/../scripts/gff2gbSmallDNA.pl %s/gffs/%s.gff %s 1000 %s/gb/%s.raw.gb 1>/dev/null 2>/dev/null' %
+    subprocess.call('$AUGUSTUS_CONFIG_PATH/../scripts/gff2gbSmallDNA.pl %s/gffs/%s.gff %s 500 %s/gb/%s.raw.gb 1>/dev/null 2>/dev/null' %
                     (mainout, entry, args['genome'], mainout, entry), shell = True)
   #bacteria clade needs to be flagged as "prokaryotic"
   NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')