Merge pull request #937 from nextgenusfs/util_gbk2parts_add

Util gbk2parts: add CDS dump
nextgenusfs · Jul 19, 2023 · 667e55c
2 parents eac3691 + b7cd660
commit 667e55c
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 20 deletions.
diff --git a/funannotate/library.py b/funannotate/library.py
@@ -24,6 +24,7 @@
 import textwrap
 import errno
 import datetime
+import traceback
 from natsort import natsorted
 import funannotate.resources as resources
 from funannotate.interlap import InterLap
@@ -2145,7 +2146,10 @@ def RevComp(s):
     s = s.upper()
     for i in range(0, n):
         c = s[n - i - 1]
-        cseq += rev_comp_lib[c]
+        if c not in rev_comp_lib:
+            sys.stderr.write(f'Reverse complement of {c} failing on {s} len(s) = {n}\n')
+        else:
+            cseq += rev_comp_lib[c]
     return cseq
 
 
@@ -2373,10 +2377,7 @@ def _sortDict(d):
                         Transcript = str(v["transcript"][i])
                     except IndexError:
                         sys.stderr.write(
-                            "Index Error retriving transcript {}: ({}, {})\n".format(
-                                i, k, v
-                            )
-                        )
+                            f"Index Error retriving transcript {i}: ({k}, {v})\n")
                     if v["strand"] == "-":
                         Transcript = RevComp(Transcript)
                     tranout.write(">%s %s\n%s\n" % (x, k, softwrap(Transcript)))
@@ -2712,12 +2713,13 @@ def dict2nucleotides2(input, prots, trans, cdstrans):
                         except IndexError:
                             pass
                         try:
-                            CDStranscript = str(v["cds_transcript"][i])
-                            if v["strand"] == "-":
-                                CDStranscript = RevComp(CDStranscript)
-                            cdsout.write(
-                                ">{:} {:}\n{:}\n".format(x, k, softwrap(CDStranscript))
-                            )
+                            cds = v["cds_transcript"][i]
+                            if cds and len(cds) > 0:
+                                CDStranscript  = str(cds)
+                                if v["strand"] == "-":
+                                    CDStranscript = RevComp(CDStranscript)
+                                cdsout.write(
+                                    ">{:} {:}\n{:}\n".format(x, k, softwrap(CDStranscript)))
                         except IndexError:
                             pass
                         if v["type"] == "mRNA":
@@ -4033,7 +4035,7 @@ def gb2gffnuc(input, gff, prots, trans, dna):
     return len(genes)
 
 
-def gb2parts(input, tbl, gff, prots, trans, dna):
+def gb2parts(input, tbl, gff, prots, trans, cds, dna):
     """
     function returns a dictionary of all gene models from a genbank file this function
     can handle multiple transcripts per locus/gene
@@ -4062,7 +4064,7 @@ def gb2parts(input, tbl, gff, prots, trans, dna):
     # write gff3 output
     dict2gff3_old(genes, gff)
     # write to protein and transcripts
-    dict2nucleotides(genes, prots, trans)
+    dict2nucleotides2(genes, prots, trans, cds)
     return len(genes)
 
 

diff --git a/funannotate/utilities/gbk2parts.py b/funannotate/utilities/gbk2parts.py
@@ -22,13 +22,13 @@ def __init__(self, prog):
     args = parser.parse_args(args)
 
     # setup output files
-    tblout = args.output+'.tbl'
-    gffout = args.output+'.gff3'
-    protout = args.output+'.proteins.fasta'
-    transout = args.output+'.transcripts.fasta'
-    dnaout = args.output+'.scaffolds.fasta'
-    lib.gb2parts(args.gbk, tblout, gffout, protout, transout, dnaout)
-
+    tblout = f'{args.output}.tbl'
+    gffout = f'{args.output}.gff3'
+    protout = f'{args.output}.proteins.fa'
+    transout = f'{args.output}.mrna-transcripts.fa'
+    cdsout = f'{args.output}.cds-transcripts.fa'
+    dnaout = f'{args.output}.scaffolds.fa'
+    lib.gb2parts(args.gbk, tblout, gffout, protout, transout, cdsout, dnaout)
 
 if __name__ == "__main__":
     main(sys.argv[1:])