-
Notifications
You must be signed in to change notification settings - Fork 2
/
count2.py
134 lines (117 loc) · 4.2 KB
/
count2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import os, sys, string, re, codecs
#path = os.path.dirname(os.path.abspath(__file__)) if __file__ else os.getcwd()
# Working directory: dictionary.txt and txt/ are read from it, and
# count.csv / kwic.csv are written into it.
path = os.getcwd()

# Punctuation that clean_text() replaces with a space.  The guillemets are
# written as unicode escapes so that, on Python 2 as well, each one is a
# single code point in the character class; as a UTF-8 byte string their
# shared lead byte 0xC2 would also match U+00C2.
reMarks = re.compile(u"[.,:;!?()/<>\u00ab\u00bb'\"]")
# Carriage returns and line feeds, deleted by clean_text().
reLineBreaks = re.compile(r"[\r\n]")
# Runs of two or more spaces, collapsed to one by clean_text().
reSpaces = re.compile(r"[ ][ ]+")
# Matches names ending in .txt or .pdf.
# NOTE(review): not referenced by any code visible in this file.
reText = re.compile(r"(\.txt|\.pdf)$")

# Filled by read_dictionary(): key -> list of keywords.
dictionary = {}
#PyPDF2
#sys.exit()
def clean_text(text):
    """Normalise a piece of text for word matching.

    Characters matched by ``reMarks`` become a space, runs of spaces are
    collapsed to one, CR/LF characters are deleted, and the result is
    lower-cased and stripped of surrounding whitespace.
    """
    # The substitutions run in this exact order; line breaks are removed
    # last, after space runs have already been collapsed.
    substitutions = (
        (reMarks, ' '),
        (reSpaces, ' '),
        (reLineBreaks, ''),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text.lower().strip()
def read_text(fileName):
    """Return the lines of a UTF-8 encoded text file as a list.

    The file is decoded with ``utf-8-sig``, so a leading byte-order mark is
    dropped.  Each returned line keeps its line terminator.

    fileName -- path of the file to read
    """
    # The context manager closes the handle even when reading or decoding
    # raises part-way through.
    with codecs.open(fileName, 'r', 'utf-8-sig') as f:
        return f.readlines()
def write_csv(counts, keywordTotalCount, keywordCount, wordCount, fileName):
    """Append one row per keyword to ``count.csv`` in the working directory.

    counts            -- sequence of [keyword, occurrences] pairs
    keywordTotalCount -- summed occurrences of all keywords
    keywordCount      -- number of keywords used as the divisor of the mean
    wordCount         -- total number of words in the analysed file
    fileName          -- '/'-separated path of the analysed file

    Row layout: file name, path components 1 to 3, the part of component 4
    before its first '_', keyword, keyword count, keyword total, mean count
    per keyword (2 decimals), keyword hits per 1000 words, word count.
    Either ratio is written as 0.0 when its denominator is zero.
    """
    parts = fileName.split('/')

    # A zero denominator (no countable keyword, or an empty file) used to
    # abort the whole run with ZeroDivisionError.
    if keywordCount:
        meanPerKeyword = round(keywordTotalCount / keywordCount, 2)
    else:
        meanPerKeyword = 0.0
    if wordCount:
        perThousandWords = (keywordTotalCount / wordCount) * 1000
    else:
        perThousandWords = 0.0
    totals = ','.join([str(keywordTotalCount), str(meanPerKeyword),
                       str(perThousandWords), str(wordCount)])

    target = path + '/count.csv'
    # The utf-8-sig writer emits a BOM on the first write of every newly
    # opened stream, so appending with it would scatter BOMs through the
    # file.  Only ask for the signature when the file is new or empty.
    hasContent = os.path.isfile(target) and os.path.getsize(target) > 0
    encoding = 'utf-8' if hasContent else 'utf-8-sig'

    with codecs.open(target, 'a', encoding) as f:
        for count in counts:
            # NOTE(review): the fixed indices assume fileName has at least
            # five '/'-separated components; which directories they select
            # depends on the depth of the working directory -- confirm.
            f.write(fileName + ',' + parts[1] + ',' + parts[2] + ','
                    + parts[3] + ',' + parts[4].split('_')[0] + ','
                    + count[0] + ',' + str(count[1]) + ',' + totals + '\n')
def write_kwic_csv(counts, kwics, fileName):
    """Append keyword-in-context rows to ``kwic.csv`` in the working directory.

    counts   -- sequence of [keyword, occurrences] pairs
    kwics    -- dict mapping a keyword to its list of context strings
    fileName -- '/'-separated path of the analysed file

    One row is written per context string: file name, path components 1 to
    3, the part of component 4 before its first '_', keyword, keyword count
    and the context wrapped in double quotes.  Keywords absent from
    ``kwics`` produce no rows.
    """
    parts = fileName.split('/')

    target = path + '/kwic.csv'
    # The utf-8-sig writer emits a BOM on the first write of every newly
    # opened stream, so appending with it would scatter BOMs through the
    # file.  Only ask for the signature when the file is new or empty.
    hasContent = os.path.isfile(target) and os.path.getsize(target) > 0
    encoding = 'utf-8' if hasContent else 'utf-8-sig'

    with codecs.open(target, 'a', encoding) as f:
        for count in counts:
            # .get(): a missing keyword means "no contexts", not a crash.
            for context in kwics.get(count[0], []):
                # Double any embedded quote so the quoted field stays valid.
                quoted = '"' + context.replace('"', '""') + '"'
                # NOTE(review): the fixed indices assume fileName has at
                # least five '/'-separated components -- confirm.
                f.write(fileName + ',' + parts[1] + ',' + parts[2] + ','
                        + parts[3] + ',' + parts[4].split('_')[0] + ','
                        + count[0] + ',' + str(count[1]) + ',' + quoted
                        + '\n')
def read_dictionary():
    """Load ``dictionary.txt`` from the working directory into ``dictionary``.

    Blank lines and lines starting with '#' are ignored.  Every other line
    has the form ``field,field,key:word,word,...`` where ';' may be used in
    place of ','.  The third field before the ':' becomes the key and the
    list of words after it becomes the value; surrounding whitespace is
    removed from the key and from each word.

    Raises ValueError for a line without ':' or with fewer than three
    fields before it.
    """
    with codecs.open(path + '/dictionary.txt', 'r', 'utf-8-sig') as f:
        lines = f.readlines()

    for number, raw in enumerate(lines, 1):
        entry = raw.strip()
        if not entry or entry.startswith('#'):
            continue
        fields = entry.replace(';', ',').split(':')
        labels = [label.strip() for label in fields[0].split(',')]
        if len(fields) < 2 or len(labels) < 3:
            # Name the offending line instead of failing with a bare
            # IndexError.
            raise ValueError('dictionary.txt line %d is malformed: %r'
                             % (number, entry))
        # Whitespace around a keyword could never match a cleaned word.
        dictionary[labels[2]] = [word.strip() for word in fields[1].split(',')]
def text_count(fileName, keywords):
    """Count how often each word occurs in a text file.

    fileName -- path of the file to read
    keywords -- unused; kept so existing callers keep working

    Returns ``[counts, wordCount]``: ``counts`` maps each cleaned word to its
    number of occurrences and ``wordCount`` is the number of non-empty words
    in the file.
    """
    counts = {}
    wordCount = 0
    for line in read_text(fileName):
        # Splitting on ' ' yields '' for a blank line; those tokens are not
        # words and must not inflate the total.
        words = [word for word in clean_text(line).split(' ') if word]
        wordCount += len(words)
        for word in words:
            counts[word] = counts.get(word, 0) + 1
    return [counts, wordCount]
def kwic(fileName, keywords, window=10):
    """Collect keyword-in-context snippets for every keyword in a file.

    fileName -- path of the file to read
    keywords -- iterable of keywords to look for (compared to cleaned words)
    window   -- number of words kept on each side of a hit (default 10)

    Returns a dict mapping every keyword to a list of context strings, one
    per occurrence, in file order.  A context is the matched word with up to
    ``window`` words before and after it from the same line, joined by
    single spaces.  Keywords that never occur map to an empty list.
    """
    print(fileName)  # progress indicator, one line per file

    # Create the result lists once, before reading: doing it per line threw
    # away the contexts of all but the last line, and left the dict empty
    # for a file without lines.
    kwics = dict((keyword, []) for keyword in keywords)

    for line in read_text(fileName):
        words = clean_text(line).split(' ')
        for keyword in keywords:
            for i, word in enumerate(words):
                if word != keyword:
                    continue
                start = max(0, i - window)
                # The slice end is exclusive, hence the +1; slicing clamps
                # at the end of the line by itself.
                end = i + window + 1
                kwics[keyword].append(' '.join(words[start:end]))
    return kwics
def find_keywords(keywords, fileName):
    """Gather keyword statistics and contexts for one file.

    keywords -- list of keywords to look up
    fileName -- path of the file to analyse

    Returns ``[keywordCounts, keywordTotalCount, keywordCount, wordCount,
    kwics]`` where ``keywordCounts`` is a list of [keyword, occurrences]
    pairs in keyword order (0 for absent keywords), ``keywordTotalCount`` is
    the sum of those occurrences, ``keywordCount`` is the number of keywords
    other than 'n/a', ``wordCount`` is the total reported by text_count()
    and ``kwics`` is the result of kwic().
    """
    counts, wordCount = text_count(fileName, keywords)

    # dict.get replaces the deprecated dict.has_key lookup.
    keywordCounts = [[keyword, counts.get(keyword, 0)] for keyword in keywords]
    keywordTotalCount = sum(pair[1] for pair in keywordCounts)
    # 'n/a' is a placeholder entry and does not count as a keyword.
    keywordCount = sum(1 for keyword in keywords if keyword != 'n/a')

    kwics = kwic(fileName, keywords)
    return [keywordCounts, keywordTotalCount, keywordCount, wordCount, kwics]
###############################################################################################################################
# Both CSV files are opened in append mode by the writers, so output left
# over from an earlier run is removed first.
for stale in ('/count.csv', '/kwic.csv'):
    if os.path.isfile(path + stale):
        os.remove(path + stale)

read_dictionary()

# Walk txt/<dirName>/<year>/<manif>; the first-level directory name selects
# the keyword list from the dictionary.
txtRoot = path + '/txt/'
for dirName in os.listdir(txtRoot):
    ctrcode = dirName
    for year in os.listdir(txtRoot + dirName):
        yearDir = txtRoot + dirName + '/' + year
        for manif in os.listdir(yearDir):
            fpath = yearDir + '/' + manif
            results = find_keywords(dictionary[ctrcode], fpath)
            keywordCounts, keywordTotalCount, keywordCount, wordCount, kwics = results
            write_csv(keywordCounts, keywordTotalCount, keywordCount, wordCount, fpath)
            write_kwic_csv(keywordCounts, kwics, fpath)
#sys.exit() #stop file here