/
getKOfromBlastx.py
44 lines (33 loc) · 1.68 KB
/
getKOfromBlastx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Python script to get KEGG Orthology for transcriptome
import sys, re, os
from optparse import OptionParser
desc = """ get Kegg Orthology from Blast file"""
parser = OptionParser(description = desc)
### input ###
parser.add_option("--blastx", help = "name of the blastx output table (matches.m8)" , action="store", type="string", dest="blastx")
parser.add_option("--spToKO", help = "name of the swiss_prot to kegg orthology conversion file (from http://www.genome.jp/linkdb/)" , action="store", type="string", dest="spKO")
#note: spToKO file must be downloaded via http://www.genome.jp/linkdb/ web interface. On the map, first click "swiss-prot," then click "Orthology."
#Then click "Download" (just below the map) to download the file containing swiss-prot to kegg orthology conversion data.
(opts, args) = parser.parse_args()
#get sp2ko conversion
sp2ko = {trans.split(":")[1]: sp.split(":")[1] for (trans,sp,extra) in (line.strip().split("\t") for line in (open(opts.spKO, 'r')))}
header= ['Transcript_id', 'BLASTX','Kegg_Orthology']
out = open(opts.blastx.split('.m8')[0] + '_keggOrthology.txt', 'w')
out.write("\t".join(header) + "\n")
with open(opts.blastx) as f:
prev = ''
for line in f:
line = line.strip().split('\t')
contig = line[0]
if contig == prev:
continue #use the top e-val result, if multiple (=first hit)
else:
prev = contig
spHit = line[1]
spShort = spHit.split('|')[1]
hitValue = sp2ko.get(spShort, None)
#if spShort in sp2ko.keys():
if hitValue is not None:
keggOrthology = sp2ko[spShort]
out.write(contig + '\t' + spHit + '\t' + keggOrthology + '\n')
f.close()