/
bigrams.py
95 lines (79 loc) · 3.6 KB
/
bigrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import csv
from pathlib import Path
from typing import Optional, Any
# Directory scanned by main() for the corpus .txt files. Written as a raw
# string so the Windows backslashes are taken literally instead of being parsed
# as (invalid) escape sequences such as "\d" and "\C".
kCorpusDir = Path(r'E:\dh\CHICAGO_CORPUS\CHICAGO_NOVEL_CORPUS')
# Stop-word list opened by read_stopwords().
kStopWords = Path('./nltk_english_stopwords.txt')
# Corpus-level output files (whole-corpus word and bigram tables).
kCountsPath = Path('chicago_counts.csv')
kBigramPath = Path('chicago_bigrams.csv')
# Directories intended for the per-file CSV output.
kCountsDir = Path('./counts/')
kBigramDir = Path('./bigrams/')
def read_stopwords(path: Optional[Path] = None) -> set:
    """Load a stop-word list, one entry per line, into a set.

    Args:
        path: File to read. When omitted, the module-level ``kStopWords``
            path is used.

    Returns:
        A set containing every line of the file with leading and trailing
        whitespace (including the newline) stripped.
    """
    if path is None:
        path = kStopWords
    # Explicit encoding, matching the other open() calls in this file, so the
    # result does not depend on the platform's default locale encoding.
    with open(path, encoding="UTF-8") as f:
        # Iterating the file object streams the lines; no intermediate list.
        return {line.strip() for line in f}
def sort_counts(counts) -> list:
    """
    Turn a dictionary of counts into a list of (key, count) tuples ordered from the highest count to the lowest.

    The dictionary's items are first copied into a list, which is then sorted in place using the second element of
    each pair (the count) as the sort key. Pairs with equal counts keep the order they had in the dictionary.
    """
    pairs = list(counts.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
def _write_csv(path, header, rows) -> None:
    """Write ``header`` and then every row of ``rows`` to ``path`` as a unix-dialect CSV."""
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; without it '\n' becomes '\r\n' on Windows.
    with open(path, 'w', encoding="UTF-8", newline='') as f:
        writer = csv.writer(f, dialect=csv.unix_dialect)
        writer.writerow(header)
        writer.writerows(rows)


def main():
    """Count words and bigrams for every .txt file in ``kCorpusDir``.

    For each file, a sorted word table is written to ``kCountsDir`` and a
    sorted bigram table to ``kBigramDir`` (same file name, ``.csv`` suffix).
    Afterwards the corpus-level tables are divided by the total number of
    counted words and written to ``kCountsPath`` and ``kBigramPath``.

    Files that cannot be read or decoded as UTF-8 are reported and skipped.

    Returns:
        0 once all output has been written.
    """
    corpus_counts = {}
    corpus_bigrams = {}
    corpus_len = 0
    files_read = 0

    # The per-file output directories must exist before anything is written.
    kCountsDir.mkdir(parents=True, exist_ok=True)
    kBigramDir.mkdir(parents=True, exist_ok=True)

    for file in kCorpusDir.iterdir():
        if not file.name.endswith(".txt"):
            continue

        # Only the read is guarded: a failure while writing results should
        # not be misreported as a problem reading the input.
        try:
            with open(file, 'r', encoding="UTF-8") as f:
                text = f.read()
        except (OSError, UnicodeError) as e:
            print(f"OS / encoding error while reading file {file.name}")
            print(e)
            continue

        # NOTE(review): count_words_and_bigrams is not defined in the part of
        # the file visible here. It presumably returns this file's
        # (counts, bigrams) dicts and accumulates into the two corpus dicts
        # in place, since nothing else in main() fills them - confirm.
        counts, bigrams = count_words_and_bigrams(text, corpus_counts, corpus_bigrams)
        corpus_len += sum(counts.values())

        _write_csv((kCountsDir / file.name).with_suffix(".csv"),
                   ['word', 'count'], sort_counts(counts))
        _write_csv((kBigramDir / file.name).with_suffix(".csv"),
                   ['bigram', 'count'], sort_counts(bigrams))

        # Dedicated counter: the original reused ``count`` as a loop variable
        # while summing word counts, which corrupted the progress report.
        files_read += 1
        if files_read % 100 == 0:
            print(f'{files_read} files read so far')

    # Normalise corpus-level counts by the total number of counted words.
    # Bigrams are divided by the same word total, as in the original code.
    if corpus_len:
        corpus_counts = {word: n / corpus_len for word, n in corpus_counts.items()}
        corpus_bigrams = {bigram: n / corpus_len for bigram, n in corpus_bigrams.items()}

    # The 'count' column of the corpus-level files holds these normalised values.
    _write_csv(kCountsPath, ['word', 'count'], sort_counts(corpus_counts))
    _write_csv(kBigramPath, ['bigram', 'count'], sort_counts(corpus_bigrams))
    return 0
if __name__ == '__main__':
    # main() returns an integer status; pass it on as the process exit code
    # instead of discarding it.
    raise SystemExit(main())