Skip to content

Commit

Permalink
updates to v0.3.2
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer authored and Jon Palmer committed Jul 1, 2016
1 parent f5a89d7 commit bfea9f0
Show file tree
Hide file tree
Showing 8 changed files with 300 additions and 50 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ To see the help menu, simply type `funannotate` in the terminal window. Similar
$ funannotate
Usage: funannotate <command> <arguments>
version: 0.1.4
version: 0.3.2
Description: Funannotate is a genome prediction, annotation, and comparison pipeline.
Expand Down
192 changes: 161 additions & 31 deletions bin/funannotate-predict.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/ubuntu_install.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ brew doctor
brew tap homebrew/dupes
brew tap homebrew/science
brew tap nextgenusfs/tap
brew update
```

4) Install funannotate and dependencies using LinuxBrew
Expand Down
3 changes: 2 additions & 1 deletion funannotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ def fmtcols(mylist, cols):
--genemark_mod GeneMark ini mod file.
--protein_evidence Proteins to map to genome (prot1.fa,prot2.fa,uniprot.fa). Default: uniprot.fa
--transcript_evidence mRNA/ESTs to align to genome (trans1.fa,ests.fa,trinity.fa). Default: none
--busco_seed_species Augustus pre-trained species to start BUSCO. Default: generic
--busco_seed_species Augustus pre-trained species to start BUSCO. Default: anidulans
--optimize_augustus Run 'optimze_augustus.pl' to refine training (long runtime)
--busco_db BUSCO models. Default: fungi [fungi,vertebrata,metazoa,eukaryota,arthropoda]
--organism Fungal-specific options. Default: fungus. [fungus,other]
Expand Down
37 changes: 22 additions & 15 deletions lib/library.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from __future__ import division
import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing
import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing, itertools
from natsort import natsorted
import warnings
from Bio import SeqIO
Expand Down Expand Up @@ -74,6 +74,9 @@ def readBlocks(source, pattern):
buffer.append( line )
yield buffer

def empty_line_sep(line):
return line=='\n'

def get_parent_dir(directory):
return os.path.dirname(directory)

Expand Down Expand Up @@ -1732,31 +1735,35 @@ def getTrainResults(input):
values3 = line.split('|') #get [6] and [7]
return (values1[1], values1[2], values2[6], values2[7], values3[6], values3[7])

def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus):
def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus, optimize):
RANDOMSPLIT = os.path.join(AUGUSTUS_BASE, 'scripts', 'randomSplit.pl')
OPTIMIZE = os.path.join(AUGUSTUS_BASE, 'scripts', 'optimize_augustus.pl')
NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')
aug_cpus = '--cpus='+str(cpus)
species = '--species='+train_species
aug_log = os.path.join(outdir, 'logfiles', 'augustus_training.log')
trainingdir = 'tmp_opt_'+train_species
with open(aug_log, 'w') as logfile:
subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 100 models for testing purposes
if not CheckAugustusSpecies(train_species): #check if training set exists, if not run etraining
subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
if not CheckAugustusSpecies(train_species):
subprocess.call([NEW_SPECIES, species], stdout = logfile, stderr = logfile)
#run etraining again to only use best models from EVM for training
subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 200 models for testing purposes
with open(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'), 'w') as initialtraining:
subprocess.call(['augustus', species, trainingset+'.test'], stdout=initialtraining)
train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'))
log.info('Initial training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
#now run optimization
subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile)
#run etraining again
subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining:
subprocess.call(['augustus', species, os.path.join(trainingdir, 'bucket1.gb')], stdout=finaltraining)
train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'))
log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
#clean up tmp folder
shutil.rmtree(trainingdir)
if optimize:
#now run optimization
subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile)
#run etraining again
subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining:
subprocess.call(['augustus', species, trainingset+'.test'], stdout=finaltraining)
train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'))
log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
#clean up tmp folder
shutil.rmtree(trainingdir)

HEADER = '''
<!DOCTYPE html>
Expand Down
62 changes: 62 additions & 0 deletions util/filter_buscos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python

#script to reformat Augustus BUSCO results
import sys, os, itertools

if len(sys.argv) < 4:
print("Usage: filter_buscos.py busco.evm.gff3 full_table_species busco.final.gff3")
sys.exit(1)

def group_separator(line):
return line=='\n'

#parse the busco table into dictionary format
busco_complete = {}
with open(sys.argv[2], 'rU') as buscoinput:
for line in buscoinput:
cols = line.split('\t')
if cols[1] == 'Complete':
ID = cols[2].replace('evm.model.', '')
if not ID in busco_complete:
busco_complete[ID] = (cols[0], cols[3])
else:
score = busco_complete.get(ID)[1]
if float(cols[3]) > float(score):
busco_complete[ID] = (cols[0], cols[3])
print ID, 'updated dictionary'
else:
print ID, 'is repeated and score is less'

#now parse the evm busco file, group them
results = []
with open(sys.argv[1]) as f:
for key, group in itertools.groupby(f, group_separator):
if not key:
results.append(list(group))

#loop through each gene model, lookup the BUSCO name, and then replace the name with counter based and busco model name
'''
scaffold_1 EVM gene 18407 18947 . - . ID=evm.TU.scaffold_1.1;Name=EVM%20prediction%20scaffold_1.1
scaffold_1 EVM mRNA 18407 18947 . - . ID=evm.model.scaffold_1.1;Parent=evm.TU.scaffold_1.1;Name=EVM%20prediction%20scaffold_1.1
scaffold_1 EVM exon 18772 18947 . - . ID=evm.model.scaffold_1.1.exon1;Parent=evm.model.scaffold_1.1
scaffold_1 EVM CDS 18772 18947 . - 0 ID=cds.evm.model.scaffold_1.1;Parent=evm.model.scaffold_1.1
scaffold_1 EVM exon 18407 18615 . - . ID=evm.model.scaffold_1.1.exon2;Parent=evm.model.scaffold_1.1
scaffold_1 EVM CDS 18407 18615 . - 1 ID=cds.evm.model.scaffold_1.1;Parent=evm.model.scaffold_1.1
'''
counter = 0
with open(sys.argv[3], 'w') as output:
for i in results:
counter += 1
cols = i[0].split('\t')
ID = cols[8].split(';')[0]
ID = ID.replace('ID=', '')
lookup = ID.replace('evm.TU.', '')
if lookup in busco_complete:
name = busco_complete.get(lookup)[0]
geneID = 'gene'+str(counter)
mrnaID = 'mrna'+str(counter)
newblock = ''.join(i)
newblock = newblock.replace('EVM%20prediction%20'+lookup, name)
newblock = newblock.replace(ID, geneID)
newblock = newblock.replace('evm.model.'+lookup, mrnaID)
output.write(newblock+'\n')
47 changes: 47 additions & 0 deletions util/fix_busco_naming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python

#script to reformat Augustus BUSCO results
import sys, os, itertools

if len(sys.argv) < 4:
print("Usage: fix_busco_naming.py busco_augustus.gff3 full_table_species busco_augustus.fixed.gff3")
sys.exit(1)

def group_separator(line):
return line=='\n'

#parse the busco table into dictionary format
busco_complete = {}
with open(sys.argv[2], 'rU') as buscoinput:
for line in buscoinput:
cols = line.split('\t')
if cols[1] == 'Complete':
if not cols[0] in busco_complete:
busco_complete[cols[0]] = cols[2]+':'+cols[3]+'-'+cols[4]

#now parse the augustus input file where gene numbers are likely repeated.
results = []
with open(sys.argv[1]) as f:
for key, group in itertools.groupby(f, group_separator):
if not key:
results.append(list(group))

#loop through each gene model, lookup the BUSCO name, and then replace the name with counter based and busco model name
counter = 0
inverse_busco = {v: k for k, v in busco_complete.items()}
with open(sys.argv[3], 'w') as output:
for i in results:
counter += 1
cols = i[0].split('\t')
lookup = cols[0]+':'+cols[3]+'-'+cols[4]
if lookup in inverse_busco:
name = inverse_busco.get(lookup)
else:
name = 'unknown_model'
ID = cols[8].split(';')[0]
ID = ID.replace('ID=', '')
newID = 'gene'+str(counter)
newblock = ''.join(i)
newblock = newblock.replace('Augustus%20prediction', name)
newblock = newblock.replace(ID, newID)
output.write(newblock+'\n')
6 changes: 4 additions & 2 deletions util/funannotate-BUSCO.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,7 +1089,7 @@ def measuring (nested):
scaff = seq_id.strip(']').split('[')[1].split(':')[0]
scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]

scaff_end = scaff_end.replace(']', '')
out.write('%s\tComplete\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))
csc[entity] = seq_id

Expand All @@ -1106,6 +1106,7 @@ def measuring (nested):
scaff = seq_id.strip(']').split('[')[1].split(':')[0]
scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]
scaff_end = scaff_end.replace(']', '')
out.write('%s\tDuplicated\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))


Expand All @@ -1122,6 +1123,7 @@ def measuring (nested):
scaff = seq_id.strip(']').split('[')[1].split(':')[0]
scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]
scaff_end = scaff_end.replace(']', '')
out.write('%s\tFragmented\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))

missing = []
Expand Down Expand Up @@ -1227,7 +1229,7 @@ def measuring (nested):
gff_file.write(line)
gff_file.close()

subprocess.call('$AUGUSTUS_CONFIG_PATH/../scripts/gff2gbSmallDNA.pl %s/gffs/%s.gff %s 1000 %s/gb/%s.raw.gb 1>/dev/null 2>/dev/null' %
subprocess.call('$AUGUSTUS_CONFIG_PATH/../scripts/gff2gbSmallDNA.pl %s/gffs/%s.gff %s 500 %s/gb/%s.raw.gb 1>/dev/null 2>/dev/null' %
(mainout, entry, args['genome'], mainout, entry), shell = True)
#bacteria clade needs to be flagged as "prokaryotic"
NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')
Expand Down

0 comments on commit bfea9f0

Please sign in to comment.