/
KEGG-to-anvio
executable file
·49 lines (49 loc) · 2.78 KB
/
KEGG-to-anvio
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python
import pandas as pd
import argparse
"""
Parse GhostKOALA (KEGG) annotation results and, optionally, merge in InterProScan results.
Assumes InterProScan was run with the following flags: -f tsv --goterms --iprlookup --pathways
"""
# Label written to the 'source' column for every GhostKOALA-derived row.
KEGG_SOURCE = 'KeggGhostKoala'

# Side-output: the reformatted orthology table is always written here,
# relative to the current working directory.
ORTHOLOGY_TABLE_PATH = "KeggOrthology_Table1.txt"

# Column names assigned to the header-less InterProScan TSV.
INTERPROSCAN_COLUMNS = [
    "gene_callers_id", "MD5", "Length", "source", "accession", "function",
    "start_loc", "stop_loc", "e_value", "status", "date",
    "InterProAccession", "InterProDescription", "GOAnnotations", "Pathway",
]


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``KeggDB``, ``i``, ``interproscan``
        and ``o`` attributes.
    """
    parser = argparse.ArgumentParser(
        description='Combines annotation Data for input to anvio')
    # The three paths below are mandatory: without them the script used to
    # fail deep inside pandas (or, for -o, silently write nothing).
    parser.add_argument(
        '--KeggDB', required=True,
        help='identify the Kegg Orthology file (modified from htext using given bash script)')
    parser.add_argument(
        '-i', required=True,
        help='specify the file containing GhostKoala Results')
    parser.add_argument(
        '--interproscan',
        help='interproscan results')
    parser.add_argument(
        '-o', required=True,
        help='Specify an output file')
    return parser.parse_args(argv)


def load_kegg_orthology(path):
    """Read the KEGG orthology table and index it by accession.

    The file is read as a header-less, tab-separated table. Its fourth
    column is split at the first space into an accession and a description.

    Args:
        path: Path to the tab-separated orthology file.

    Returns:
        A DataFrame indexed by ``accession`` with the columns
        ``Category1``, ``Category2``, ``Category3`` and ``description``.
    """
    table = pd.read_table(path, header=None)
    # `n` must be passed by keyword (positional use was removed in pandas 2.0).
    # reindex() guarantees two columns even if no value contains a space.
    parts = table[3].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
    parts.columns = ['accession', 'description']
    orthology = pd.concat([table.drop(columns=3), parts], axis=1)
    orthology = orthology.set_index('accession')
    orthology.columns = ["Category1", "Category2", "Category3", "description"]
    return orthology


def annotate_ghostkoala(results_path, orthology):
    """Attach orthology descriptions to GhostKOALA hits.

    Args:
        results_path: Path to the header-less, tab-separated GhostKOALA
            result file (gene id, accession).
        orthology: Table returned by ``load_kegg_orthology``.

    Returns:
        A DataFrame indexed by ``gene_callers_id`` (as strings) with the
        columns ``source``, ``accession``, ``function`` and ``e_value``.
    """
    hits = pd.read_table(results_path, header=None,
                         names=["gene_callers_id", "accession"],
                         index_col=None)
    # Strip the 'genecall_' prefix wherever it occurs.
    hits = hits.replace({'genecall_': ''}, regex=True)
    # Rows without an accession carry no annotation and are dropped.
    hits = hits.dropna().set_index("accession")

    merged = hits.join(orthology)
    # An accession can occur several times in the orthology table; keep a
    # single row (the last one) per gene call.
    merged = merged.drop_duplicates(subset='gene_callers_id', keep="last")

    annotations = (merged.filter(['gene_callers_id', 'description'])
                   .rename_axis('accession')
                   .reset_index()
                   .set_index('gene_callers_id'))
    annotations.insert(0, 'source', KEGG_SOURCE)
    # GhostKOALA results carry no e-value here; a constant 0 is written.
    annotations.insert(3, 'e_value', 0)
    return annotations.rename(columns={'description': 'function'}, index=str)


def load_interproscan(path):
    """Read an InterProScan TSV and reduce it to the annotation columns.

    Args:
        path: Path to the header-less InterProScan TSV file.

    Returns:
        A DataFrame indexed by ``gene_callers_id`` with the columns
        ``source``, ``accession``, ``function`` and ``e_value``.
    """
    interpro = pd.read_table(path, header=None, names=INTERPROSCAN_COLUMNS)
    reduced = interpro.filter(
        ["gene_callers_id", "source", "accession", "function", "e_value"])
    reduced = reduced.replace({'genecall_': ''}, regex=True)
    # A bare '-' in the e-value column is replaced by 0.
    reduced['e_value'] = reduced['e_value'].replace('-', 0)
    return reduced.set_index("gene_callers_id")


def main(argv=None):
    """Build the tab-separated annotation table from the given inputs.

    Writes the reformatted orthology table to ``KeggOrthology_Table1.txt``
    in the current directory, prints a preview of the GhostKOALA
    annotations, and writes the final table (with InterProScan rows
    appended when ``--interproscan`` is given) to the ``-o`` path.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = parse_args(argv)

    orthology = load_kegg_orthology(args.KeggDB)
    orthology.to_csv(ORTHOLOGY_TABLE_PATH, encoding='utf-8')

    annotations = annotate_ghostkoala(args.i, orthology)
    print(annotations.head())

    if args.interproscan is not None:
        annotations = pd.concat(
            [annotations, load_interproscan(args.interproscan)])
    annotations.to_csv(args.o, sep='\t')


if __name__ == "__main__":
    main()