# wordstat_old.py
from __future__ import division
import io
import string
import sys
import getopt
import pickle
import nltk
from nltk import FreqDist
from nltk import word_tokenize

# Defaults; overridden by the command-line flags parsed in parseArgs().
in_file = False
out_file = False
min_freq = 50
min_len = 3
phrase_len = 1
out_dir = "out/"
preload_tokens = False
tokenize_mode = False
token_file = ""
conc_term = ""
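# Example command lines (a sketch; "corpus.txt" and "corpus.token" are
# placeholder file names, and the flags follow the getopt spec in parseArgs):
#   python wordstat_old.py -z -i corpus.txt -o corpus.token   # tokenize to a token file
#   python wordstat_old.py -i corpus.txt -m 3 -f 50           # frequency statistics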
def parseArgs():
    global in_file, out_file, min_freq, min_len, phrase_len, out_dir, preload_tokens, tokenize_mode, token_file, conc_term
    print "Parsing arguments..."
    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   "hi:o:m:f:bt:c:z",
                                   ["in_file=",
                                    "out_file=",
                                    "min_len=",
                                    "min_freq=",
                                    "bigram",
                                    "token_file=",
                                    "tokenize",
                                    "conc="])
    except getopt.GetoptError:
        print 'Usage: wordstat_old.py -i <in_file> -o <out_file> [-m <min_len>] [-f <min_freq>] [-b] [-t <token_file>] [-c <conc_term>] [-z]'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'Usage: wordstat_old.py -i <in_file> -o <out_file> [-m <min_len>] [-f <min_freq>] [-b] [-t <token_file>] [-c <conc_term>] [-z]'
            sys.exit()
        elif opt in ("-i", "--in_file"):
            in_file = arg
        elif opt in ("-o", "--out_file"):
            out_file = arg
        elif opt in ("-m", "--min_len"):
            min_len = int(arg)
        elif opt in ("-f", "--min_freq"):
            min_freq = int(arg)
        elif opt in ("-b", "--bigram"):
            phrase_len = 2
        elif opt in ("-t", "--token_file"):
            preload_tokens = True
            token_file = arg
        elif opt in ("-z", "--tokenize"):
            tokenize_mode = True
            print "Tokenize mode..."
        elif opt in ("-c", "--conc"):
            conc_term = arg
def printTop(tokens, n, min_len):
    # Write the n most frequent tokens of length >= min_len to a CSV file.
    out_path = out_dir + "output_min_" + str(n) + "_len_gt_" + str(min_len) + ".csv"
    fdist = FreqDist(tokens)
    most_common = fdist.most_common(n)
    out_fh = open(out_path, "w")
    out_fh.write("word,frequency\n")
    for word, freq in most_common:
        if len(word) >= min_len:
            out_fh.write("\"" + word + "\"," + str(freq) + "\n")
    out_fh.close()
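# Example: printTop(tokens, 2000, 3) writes out/output_min_2000_len_gt_3.csv
# with one "word",frequency row per token of length >= 3 among the 2000 most
# frequent tokens (so the file may hold fewer than 2000 rows).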
def printTopBigrams(tokens, n):
    # Write the n most frequent adjacent word pairs (bigrams) to a CSV file.
    out_path = out_dir + "output_bigrams.csv"
    bigrams = nltk.bigrams(tokens)
    fdist = nltk.FreqDist(bigrams)
    most_common = fdist.most_common(n)
    out_fh = open(out_path, "w")
    out_fh.write("word1,word2,frequency\n")
    for (w1, w2), freq in most_common:
        out_fh.write("\"" + str(w1) + "\",\"" + str(w2) + "\"," + str(freq) + "\n")
    out_fh.close()
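# Example: printTopBigrams(tokens, 50) writes the 50 most frequent adjacent
# word pairs to out/output_bigrams.csv as "word1","word2",frequency rows.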
def context(tokens):
    # Build a concordance index mapping each token to its positions in `tokens`.
    return nltk.ConcordanceIndex(tokens)
def context2(tokens, word, word2):
    # Return excerpts around each occurrence of `word` that also contain `word2`.
    # ConcordanceIndex offsets are token positions, so the excerpt window is
    # measured in tokens rather than characters.
    window = 40
    index = context(tokens)
    strs = []
    for i in index.offsets(word):
        start = max(0, i - window)
        end = min(i + window, len(tokens))
        excerpt = " ".join(tokens[start:end])
        if word2 in excerpt:
            strs.append(excerpt)
    return strs
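# Example (with placeholder search terms): context2(tokens, "whale", "ship")
# returns one excerpt per occurrence of "whale" whose surrounding window of
# tokens also contains the substring "ship".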
# read in a text file and return its tokens
def tokenize():
    # Process input file
    #-----------------------------
    f = io.open(in_file)
    raw = f.read().lower()
    f.close()
    # Keep only printable ASCII, then only letters and whitespace.
    printable = set(string.printable)
    raw = filter(lambda x: x in printable, raw)
    raw = filter(lambda x: x.isalpha() or x.isspace(), raw)
    print "Opened file " + in_file + " (" + str(len(raw)) + " characters after filtering)."
    print "Tokenizing..."
    tokens = word_tokenize(raw)
    return tokens
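# Example: a file containing "It's a TEST." tokenizes to ["its", "a", "test"],
# because the text is lowercased and stripped to letters and whitespace before
# word_tokenize runs.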
def tokenize_to_file():
    # Tokenize the input file and pickle the token list to disk.
    if in_file == False or out_file == False:
        print "Error: Tokenize mode requires input and output files"
        sys.exit(2)
    # save to binary file
    #-----------------------------
    tokens = tokenize()
    out_path = out_file
    if not out_path.endswith(".token"):
        out_path += ".token"
    of = open(out_path, "wb")  # pickle requires a binary-mode file
    pickle.dump(tokens, of)
    of.close()
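# A saved token file can be reloaded with pickle, e.g. ("corpus.token" is a
# placeholder name):
#   with open("corpus.token", "rb") as fh:
#       tokens = pickle.load(fh)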
def main():
    parseArgs()
    if tokenize_mode:
        tokenize_to_file()
        sys.exit(0)
    else:
        print "Tokenize mode: false..."
def oldMain():
    # Legacy processing path; kept for reference and not called from main().
    tokens = tokenize()
    print "Found " + str(len(tokens)) + " tokens."
    #printTop(tokens, 2000, 1)
    print "Now processing bigrams"
    #printTopBigrams(tokens, 50)
    print "Done!"

if __name__ == "__main__":
    main()