/
FindMeHemes.py
executable file
·154 lines (131 loc) · 6.82 KB
/
FindMeHemes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python3
# !/bin/sh
from collections import defaultdict
import re
import os
import textwrap
import argparse
import sys
parser = argparse.ArgumentParser(
prog="FindMeHemes.py",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent('''
*******************************************************
*******************************************************
*******************************************************
Script for identifying proteins with heme c-binding motifs
Developed by Arkadiy Garber: agarber4@asu.edu
*******************************************************
*******************************************************
*******************************************************
'''))
parser.add_argument('-contigs', type=str, help='File with contigs in FASTA format. if you provide this, the program '
'will run Prodigal to identify candidate open reading frames, then use'
'those proteins to search for candidate heme-binding '
'sites.', default="NA")
parser.add_argument('-proteins', type=str, help="File with gene-calls in FASTA amino acid format. "
"These proteins will be used to search for candidate heme-binding "
"sites", default="NA")
# parser.add_argument('-outDir', type=str, help="name of directory to which output files will be written")
parser.add_argument('-mode', type=str, help="Is the provided FASTA file a single genome or a metagenome? "
"(default=genome). This is only relevant if you provide contigs, instead of proteins.", default="genome")
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(0)
args = parser.parse_known_args()[0]
def replace(stringOrlist, list, item):
emptyList = []
for i in stringOrlist:
if i not in list:
emptyList.append(i)
else:
emptyList.append(item)
outString = "".join(emptyList)
return outString
def remove(stringOrlist, list):
emptyList = []
for i in stringOrlist:
if i not in list:
emptyList.append(i)
else:
pass
outString = "".join(emptyList)
return outString
def fasta(fasta_file):
seq = ''
header = ''
Dict = defaultdict(lambda: defaultdict(lambda: 'EMPTY'))
for i in fasta_file:
i = i.rstrip()
if re.match(r'^>', i):
if len(seq) > 0:
Dict[header] = seq
header = i[1:]
seq = ''
else:
header = i[1:]
seq = ''
else:
seq += i
seq += "\n"
Dict[header] = seq
return Dict
if args.contigs != "NA":
print("Contigs file provided. Starting ORF prediction using Prodigal")
if args.mode == "genome":
os.system("prodigal -q -i " + args.contigs + " -a " + args.contigs + ".proteins.faa -o progidal.out")
print("finished ORF prediction. Beginning identification of heme-binding motifs")
prots = open(args.contigs + ".proteins.faa", "r")
prots = fasta(prots)
outFASTA = open(args.contigs + ".hemeProts.fasta", "w")
outCSV = open(args.contigs + ".hemeProts.csv", "w")
outCSV.write("ORF" + "," + "number_of_hemes" + "\n")
for i in prots.keys():
if re.findall(r'C(..)CH', prots[i]) or re.findall(r'C(...)CH', prots[i]) or re.findall(r'C(....)CH', prots[i]) or re.findall(r'C(..............)CH', prots[i]) or re.findall(r'C(...............)CH', prots[i]):
outCSV.write(replace(i, [","], ";") + "," + str(
len(re.findall(r'C(..)CH', prots[i])) + len(re.findall(r'C(...)CH', prots[i])) + len(re.findall(r'C(....)CH', prots[i])) +
len(re.findall(r'C(..............)CH', prots[i])) + len(re.findall(r'C(...............)CH', prots[i]))) + "\n")
outFASTA.write(">" + i + "\n")
outFASTA.write(prots[i] + "\n")
outFASTA.close()
outCSV.close()
if args.mode == "metagenome":
os.system("prodigal -q -i " + args.contigs + " -a " + args.contigs + ".proteins.faa -p meta -o progidal.out")
print("finished ORF prediction. Beginning identification of heme-binding motifs")
prots = open(args.contigs + ".proteins.faa", "r")
prots = fasta(prots)
outFASTA = open(args.contigs + ".hemeProts.fasta", "w")
outCSV = open(args.contigs + ".hemeProts.csv", "w")
outCSV.write("ORF" + "," + "number_of_hemes" + "\n")
for i in prots.keys():
if re.findall(r'C(..)CH', prots[i]) or re.findall(r'C(...)CH', prots[i]) or re.findall(r'C(....)CH', prots[i]) or re.findall(r'C(..............)CH', prots[i]) or re.findall(r'C(...............)CH', prots[i]):
outCSV.write(replace(i, [","], ";") + "," + str(
len(re.findall(r'C(..)CH', prots[i])) + len(re.findall(r'C(...)CH', prots[i])) + len(
re.findall(r'C(....)CH', prots[i])) +
len(re.findall(r'C(..............)CH', prots[i])) + len(re.findall(r'C(...............)CH', prots[i]))) + "\n")
outFASTA.write(">" + i + "\n")
outFASTA.write(prots[i] + "\n")
outFASTA.close()
outCSV.close()
else:
print("Please provide the program with some information about the type of file that is being processed. Is "
"it a single genome, or a set of assembled contigs from a metagenome or multiple set of genomes? "
"This can be answered via the -mode flag")
if args.proteins != "NA":
print("Amino acid FASTA file is provided. Skipping Prodigal, and moving on to identification of heme-binding motifs")
prots = open(args.proteins, "r")
prots = fasta(prots)
outFASTA = open(args.proteins + ".hemeProts.fasta", "w")
outCSV = open(args.proteins + ".hemeProts.csv", "w")
outCSV.write("ORF" + "," + "number_of_hemes" + "\n")
for i in prots.keys():
if re.findall(r'C(..)CH', prots[i]) or re.findall(r'C(...)CH', prots[i]) or re.findall(r'C(....)CH', prots[i]) or re.findall(r'C(..............)CH', prots[i]) or re.findall(r'C(...............)CH', prots[i]):
outCSV.write(replace(i, [","], ";") + "," + str(
len(re.findall(r'C(..)CH', prots[i])) + len(re.findall(r'C(...)CH', prots[i])) + len(
re.findall(r'C(....)CH', prots[i])) +
len(re.findall(r'C(..............)CH', prots[i])) + len(re.findall(r'C(...............)CH', prots[i]))) + "\n")
outFASTA.write(">" + i + "\n")
outFASTA.write(prots[i] + "\n")
outFASTA.close()
outCSV.close()
print("All done. Thank you for using FindMeHemes!")