updates to v0.3.1

nextgenusfs · Jun 20, 2016 · 66433ee · 66433ee
1 parent 021da94
commit 66433ee
Show file tree

Hide file tree

Showing 5 changed files with 167 additions and 56 deletions.
diff --git a/bin/augustus_parallel.py b/bin/augustus_parallel.py
@@ -48,9 +48,9 @@ def runAugustus(Input):
     aug_out = os.path.join(tmpdir, Input+'.augustus.gff3')   
     with open(aug_out, 'w') as output:
         if args.hints:
-            subprocess.call(['augustus', species, hints_input, extrinsic, '--gff3=on', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr= FNULL)
+            subprocess.call(['augustus', species, hints_input, extrinsic, '--gff3=on', '--stopCodonExcludedFromCDS=False', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr= FNULL)
         else:
-            subprocess.call(['augustus', species, '--gff3=on', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr = FNULL)
+            subprocess.call(['augustus', species, '--gff3=on', '--stopCodonExcludedFromCDS=False', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr = FNULL)
 
 
 #first step is to split input fasta file into individual files in tmp folder

diff --git a/bin/funannotate-predict.py b/bin/funannotate-predict.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-import sys, os, subprocess, inspect, multiprocessing, shutil, argparse, time
+import sys, os, subprocess, inspect, multiprocessing, shutil, argparse, time, re
 from Bio import SeqIO
 currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
 parentdir = os.path.dirname(currentdir)
@@ -35,7 +35,7 @@ def __init__(self,prog):
 parser.add_argument('--rna_bam', help='BAM (sorted) of RNAseq aligned to reference for BRAKER1')
 parser.add_argument('--min_intronlen', default=10, help='Minimum intron length for gene models')
 parser.add_argument('--max_intronlen', default=3000, help='Maximum intron length for gene models')
-parser.add_argument('--min_protlen', default=51, type=int, help='Minimum amino acid length for valid gene model')
+parser.add_argument('--min_protlen', default=50, type=int, help='Minimum amino acid length for valid gene model')
 parser.add_argument('--keep_no_stops', action='store_true', help='Keep gene models without valid stop codons')
 parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to use')
 parser.add_argument('--busco_seed_species', default='anidulans', help='Augustus species to use as initial training point for BUSCO')
@@ -448,37 +448,74 @@ def __init__(self,prog):
                 subprocess.call(['perl', Converter, aug_out], stdout = output, stderr = FNULL)
 
     if not GeneMark:
+        #count contigs
+        num_contigs = lib.countfasta(MaskGenome)
         #now run GeneMark-ES, first check for gmhmm mod file, use if available otherwise run ES
         if not args.genemark_mod:
-            GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
-            if not os.path.isfile(GeneMarkGFF3):
-                if args.organism == 'fungus':
-                    lib.RunGeneMarkES(MaskGenome, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, True)
-                else:
-                    lib.RunGeneMarkES(MaskGenome, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, False)
-            GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
-            with open(GeneMarkTemp, 'w') as output:
-                subprocess.call(['perl', Converter, GeneMarkGFF3], stdout = output, stderr = FNULL)
-            GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
-            with open(GeneMark, 'w') as output:
-                with open(GeneMarkTemp, 'rU') as input:
-                    lines = input.read().replace("Augustus","GeneMark")
-                    output.write(lines)
+            #if there are less than 2 data points (contigs, self-training fails), count contigs
+            if num_contigs < 2:
+                lib.log.error("GeneMark-ES cannot run with only a single contig, you must provide --ini_mod file to run GeneMark")
+            else:
+                GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
+                if not os.path.isfile(GeneMarkGFF3):
+                    if args.organism == 'fungus':
+                        lib.RunGeneMarkES(MaskGenome, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, True)
+                    else:
+                        lib.RunGeneMarkES(MaskGenome, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, False)
+                GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
+                with open(GeneMarkTemp, 'w') as output:
+                    subprocess.call(['perl', Converter, GeneMarkGFF3], stdout = output, stderr = FNULL)
+                GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
+                with open(GeneMark, 'w') as output:
+                    with open(GeneMarkTemp, 'rU') as input:
+                        lines = input.read().replace("Augustus","GeneMark")
+                        output.write(lines)
         else:   #have training parameters file, so just run genemark with
             GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
-            if not os.path.isfile(GeneMarkGFF3):
-                if args.organism == 'fungus':
-                    lib.RunGeneMark(MaskGenome, args.genemark_mod, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, True)
-                else:
-                    lib.RunGeneMark(MaskGenome, args.genemark_mod, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, False)                  
-            GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
-            with open(GeneMarkTemp, 'w') as output:
-                subprocess.call(['perl', Converter, GeneMarkGFF3], stdout = output, stderr = FNULL)
-            GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
-            with open(GeneMark, 'w') as output:
-                with open(GeneMarkTemp, 'rU') as input:
-                    lines = input.read().replace("Augustus","GeneMark")
-                    output.write(lines)
+            if num_contigs < 2: #now can run modified prediction on single contig
+                with open(MaskGenome, 'rU') as genome:
+                    for line in genome:
+                        if line.startswith('>'):
+                            header = line.replace('>', '')
+                            header = header.replace('\n', '')
+                GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
+                GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
+                if not os.path.isfile(GeneMarkGFF3):
+                    lib.log.info("Running GeneMark on single-contig assembly")
+                    subprocess.call(['gmhmme3', '-m', args.genemark_mod, '-o', GeneMarkGFF3, '-f', 'gff3', MaskGenome])
+                #now open output and reformat
+                lib.log.info("Converting GeneMark GTF file to GFF3")
+                with open(GeneMarkTemp, 'w') as geneout:
+                    with open(GeneMarkGFF3, 'rU') as genein:
+                        for line in genein:
+                            if not line.startswith('#'):
+                                if not '\tIntron\t' in line:
+                                    newline = line.replace('seq', header)
+                                    newline = newline.replace('.hmm3', '')
+                                    geneout.write(newline)
+                GeneMarkTemp2 = os.path.join(args.out, 'predict_misc', 'genemark.temp2.gff')
+                with open(GeneMarkTemp2, 'w') as output:
+                    subprocess.call(['perl', Converter, GeneMarkTemp], stdout = output, stderr = FNULL)                    
+                with open(GeneMark, 'w') as output:
+                    with open(GeneMarkTemp2, 'rU') as input:
+                        lines = input.read().replace("Augustus","GeneMark")
+                        output.write(lines)
+
+            else:
+                GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
+                if not os.path.isfile(GeneMarkGFF3):
+                    if args.organism == 'fungus':
+                        lib.RunGeneMark(MaskGenome, args.genemark_mod, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, True)
+                    else:
+                        lib.RunGeneMark(MaskGenome, args.genemark_mod, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, False)                  
+                GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
+                with open(GeneMarkTemp, 'w') as output:
+                    subprocess.call(['perl', Converter, GeneMarkGFF3], stdout = output, stderr = FNULL)
+                GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
+                with open(GeneMark, 'w') as output:
+                    with open(GeneMarkTemp, 'rU') as input:
+                        lines = input.read().replace("Augustus","GeneMark")
+                        output.write(lines)
 
     if not Augustus: 
         if not args.augustus_species:
@@ -542,28 +579,70 @@ def __init__(self,prog):
     if GM_check < 3:
         gmc = 0
         lib.log.error("GeneMark predictions failed, proceeding with only Augustus")
+
+    #if hints used for Augustus, get high quality models > 80% coverage to pass to EVM
+    if os.path.isfile(hints_all):
+        lib.log.info("Pulling out high quality Augustus predictions")
+        hiQ_models = []
+        with open(aug_out, 'rU') as augustus:
+            for pred in lib.readBlocks(augustus, '# start gene'):
+                values = []
+                geneID = ''
+                support = ''
+                if pred[0].startswith('# This output'):
+                    continue
+                if pred[0].startswith('##gff-version 3'):
+                    continue
+                for line in pred:
+                    line = line.replace('\n', '')
+                    if line.startswith('# start gene'):
+                        geneID = line.split(' ')[-1]
+                        values.append(geneID)
+                    if line.startswith('# % of transcript supported by hints'):
+                        support = line.split(' ')[-1]
+                        values.append(support)
+                if float(values[1]) > 65: #greater than ~66% of exons supported, i.e. 2/3 or 3/4, but not less
+                    hiQ_models.append(values[0])
+
+        #now open evm augustus and pull out models
+        HiQ = set(hiQ_models)
+        lib.log.info("Found %i high quality predictions from Augustus (>66%% exon evidence)"  % len(HiQ))
+        HiQ_match = re.compile(r'\b(?:%s)[\.t1;]+\b' % '|'.join(HiQ))
+        AugustusHiQ = os.path.join(args.out, 'predict_misc', 'augustus-HiQ.evm.gff3')
+        with open(AugustusHiQ, 'w') as HiQ_out:
+            with open(Augustus, 'rU') as evm_aug:
+                for line in evm_aug:
+                    if HiQ_match.search(line):
+                        newline = line.replace('\tAugustus\t', '\tHiQ\t')
+                        HiQ_out.write(newline)
+
 
     #EVM related input tasks, find all predictions and concatenate together
     if args.pasa_gff:
-        pred_in = [Augustus, GeneMark, args.pasa_gff]
+        if os.path.isfile(hints_all):
+            pred_in = [Augustus, GeneMark, args.pasa_gff, AugustusHiQ]
+        else:
+            pred_in = [Augustus, GeneMark, args.pasa_gff]
     else:
-        pred_in = [Augustus, GeneMark]
+        if os.path.isfile(hints_all):
+            pred_in = [Augustus, GeneMark, AugustusHiQ]
+        else:
+            pred_in = [Augustus, GeneMark]
     Predictions = os.path.join(args.out, 'predict_misc', 'predictions.gff3')
     with open(Predictions, 'w') as output:
         for f in pred_in:
             with open(f) as input:
                 output.write(input.read())
 
-    #set Weights file dependent on which data is present.  I have yet to find an example of where Augustus outperforms GeneMark for fungi, but i don't have too much evidence to think that genemark is perfect either....
+    #set Weights file dependent on which data is present.
     Weights = os.path.join(args.out, 'predict_misc', 'weights.evm.txt')
     with open(Weights, 'w') as output:
+        output.write("ABINITIO_PREDICTION\tAugustus\t1\n")
+        output.write("ABINITIO_PREDICTION\tGeneMark\t1\n")
+        if os.path.isfile(hints_all):
+            output.write("OTHER_PREDICTION\tHiQ\t10\n")
         if args.pasa_gff:
             output.write("OTHER_PREDICTION\ttransdecoder\t10\n")
-            output.write("ABINITIO_PREDICTION\tAugustus\t1\n")
-            output.write("ABINITIO_PREDICTION\tGeneMark\t1\n")
-        else:
-            output.write("ABINITIO_PREDICTION\tAugustus\t1\n")
-            output.write("ABINITIO_PREDICTION\tGeneMark\t1\n")
         if exonerate_out:
             output.write("PROTEIN\texonerate\t1\n")
         if Transcripts:
@@ -607,6 +686,12 @@ def __init__(self,prog):
 else:
     lib.log.info('{0:,}'.format(total) + ' total gene models from EVM')
 
+#move EVM folder to predict folder
+if os.path.isdir('EVM_tmp'):
+    if os.path.isdir(os.path.join(args.out, 'predict_misc', 'EVM')):
+        shutil.rmtree(os.path.join(args.out, 'predict_misc', 'EVM'))
+    os.rename('EVM_tmp', os.path.join(args.out, 'predict_misc', 'EVM'))
+
 #run tRNAscan
 lib.log.info("Predicting tRNAs")
 tRNAscan = os.path.join(args.out, 'predict_misc', 'trnascan.gff3')
@@ -631,7 +716,7 @@ def __init__(self,prog):
 lib.log.info('{0:,}'.format(total) + ' total gene models')
 
 #filter bad models
-lib.log.info("Filtering out bad gene models (< %i aa in length, transposable elements, etc)." % (args.min_protlen - 1))
+lib.log.info("Filtering out bad gene models (< %i aa in length, transposable elements, etc)." % (args.min_protlen))
 Blast_rep_remove = os.path.join(args.out, 'predict_misc', 'repeat.gene.models.txt')
 if not os.path.isfile(Blast_rep_remove):
     lib.RepeatBlast(GAG_proteins, args.cpus, 1e-10, os.path.join(args.out, 'predict_misc'), Blast_rep_remove)

diff --git a/bin/funannotate-runEVM.py b/bin/funannotate-runEVM.py
@@ -93,6 +93,8 @@ def safe_run(*args, **kwargs):
     x = num_lines
 else:
     x = cpus - 1
+if x < 1:
+    x = 1
 lib.log.info("Running EVM commands with %i CPUs" % (x))
 #print "Splitting over", cpus, "CPUs"
 n = int(round(num_lines / x))
@@ -133,4 +135,4 @@ def safe_run(*args, **kwargs):
                     shutil.copyfileobj(readfile, out)
 
 #remove your mess
-shutil.rmtree(tmpdir)
+#shutil.rmtree(tmpdir)
diff --git a/funannotate.py b/funannotate.py
@@ -31,7 +31,7 @@ def fmtcols(mylist, cols):
              for i in range(0,num_lines))
     return "\n".join(lines)
 
-version = '0.3.0'
+version = '0.3.1'
 
 default_help = """
 Usage:       funannotate <command> <arguments>

diff --git a/lib/library.py b/lib/library.py
@@ -64,6 +64,16 @@ def checkInternet():
     except urllib2.URLError as err: pass
     return False
 
+def readBlocks(source, pattern):
+    buffer = []
+    for line in source:
+        if line.startswith(pattern):
+            if buffer: yield buffer
+            buffer = [ line ]
+        else:
+            buffer.append( line )
+    yield buffer
+
 def get_parent_dir(directory):
     return os.path.dirname(directory)
 
@@ -890,26 +900,40 @@ def RemoveBadModels(proteins, gff, length, repeats, BlastResults, tmpdir, output
     remove = [w.replace('evm.TU.','') for w in remove]
     remove = [w.replace('evm.model.','') for w in remove]
     remove = set(remove)
-    remove_match = re.compile(r'\b(?:%s)[\.;]+\b' % '|'.join(remove))
-    with open(output, 'w') as out:
-        with open(os.path.join(tmpdir, 'bad_models.gff'), 'w') as out2:
+    if len(remove) > 0:
+        remove_match = re.compile(r'\b(?:%s)[\.;]+\b' % '|'.join(remove))
+        with open(output, 'w') as out:
+            with open(os.path.join(tmpdir, 'bad_models.gff'), 'w') as out2:
+                with open(gff, 'rU') as GFF:
+                    for line in GFF:
+                        if '\tstart_codon\t' in line:
+                            continue
+                        if '\tstop_codon\t' in line:
+                            continue
+                        if not remove_match.search(line):
+                            line = re.sub(';Name=.*$', ';', line) #remove the Name attribute as it sticks around in GBK file
+                            out.write(line)           
+                        else:
+                            if "\tgene\t" in line:
+                                bad_ninth = line.split('ID=')[-1]
+                                bad_ID = bad_ninth.split(";")[0]                
+                                bad_reason = reason.get(bad_ID)
+                                if bad_reason:
+                                    line = line.replace('\n', ';'+bad_reason+'\n')
+                                else:
+                                    line = line.replace('\n', ';remove_reason=unknown;\n')
+                            out2.write(line)
+    else: #if nothing to remove, just print out GFF
+        with open(output, 'w') as out:
             with open(gff, 'rU') as GFF:
                 for line in GFF:
                     if '\tstart_codon\t' in line:
                         continue
                     if '\tstop_codon\t' in line:
                         continue
-                    if not remove_match.search(line):
-                        line = re.sub(';Name=.*$', ';', line) #remove the Name attribute as it sticks around in GBK file
-                        out.write(line)           
-                    else:
-                        if "\tgene\t" in line:
-                            bad_ninth = line.split('ID=')[-1]
-                            bad_ID = bad_ninth.split(";")[0]                
-                            bad_reason = reason.get(bad_ID)
-                            line = line.replace('\n', ';'+bad_reason+'\n')
-                        out2.write(line)
-
+                    line = re.sub(';Name=.*$', ';', line) #remove the Name attribute as it sticks around in GBK file
+                    out.write(line)
+
 def CleantRNAtbl(GFF, TBL, output):
     #clean up genbank tbl file from gag output
     #try to read through GFF file, make dictionary of tRNA genes and products