This repository has been archived by the owner on Oct 18, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
gazetteer.py
87 lines (76 loc) · 2.47 KB
/
gazetteer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import json, tqdm, sys, os
import pandas as pd
# mvnt.tsv is a dataset downloaded from IMDB.
# Only the following columns have been saved:
#   * primaryTitle - most popular title
#   * startYear    - start of release
#   * endYear      - end of release (TV shows, etc.)
# NOTE(review): the code below reads a "title" column, so the saved file
# presumably stores primaryTitle under that name -- confirm against the data.

# Load the tab-separated gazetteer, discard the leftover index column and
# every row that has no title.
gaz = pd.read_csv("mvnt.tsv", sep="\t")
gaz = gaz.drop(columns=["Unnamed: 0"]).dropna(subset=["title"])

# Abbreviations and their expanded forms.
abbr_map = dict(yrs="years", min="minutes")

# Testing the Affix Tree
# gaz = {"title": ["Caller", "Callee", "Called", "Calles"]}
class Affix_tree:
    """Character trie of titles, stored as nested dicts.

    Every node is a dict that maps a single character to its child node. A
    node that terminates a stored title additionally carries the key
    ``"end"`` with the value ``True``. Because the structure consists only of
    dicts, strings and booleans it can be written to and read from JSON.
    """

    def __init__(self, loading_file=None):
        """Create an empty tree, or load one from a JSON file.

        Args:
            loading_file: Path of a JSON file holding a previously saved
                tree. When ``None`` (the default) an empty tree is created.
        """
        if loading_file is not None:
            print("Loading the Gazetteer")
            with open(loading_file, encoding="utf-8") as file:
                self.tree = json.load(file)
        else:
            self.tree = {}

    def add(self, title):
        """Insert *title* into the tree, one node per character.

        The title is stored exactly as given; no case folding is applied.

        Args:
            title: The string to store.

        Raises:
            ValueError: If *title* cannot be walked character by character
                (for example when it is a number rather than a string).
        """
        node = self.tree
        try:
            for character in title:
                # Reuse the child node when present, create it otherwise.
                node = node.setdefault(character, {})
        except TypeError as err:
            raise ValueError(
                "Came across an error while processing:", title
            ) from err
        node["end"] = True

    def has(self, title):
        """Return whether *title* is stored in the tree, ignoring case.

        Both the lower- and upper-case variant of every character are
        followed, so the lookup works whether the tree was built from
        lower-cased or original-case titles.

        Args:
            title: The string to look up.

        Returns:
            ``True`` when a stored title matches *title* case-insensitively,
            ``False`` otherwise (including when *title* is merely a prefix of
            a stored title).
        """
        # All nodes reachable so far; more than one when the tree holds
        # branches that differ only by letter case.
        nodes = [self.tree]
        for character in title.lower():
            variants = {character.lower(), character.upper()}
            nodes = [
                node[variant]
                for node in nodes
                for variant in variants
                if variant in node
            ]
            if not nodes:
                return False
        # A node without the "end" marker is only a prefix, not a title.
        return any(node.get("end", False) for node in nodes)

    def save(self, path="affix.tree"):
        """Write the tree as JSON to *path*.

        Args:
            path: Destination file; defaults to ``"affix.tree"``.
        """
        print("Saving the Gazetteer")
        with open(path, "w", encoding="utf-8") as file:
            json.dump(self.tree, file)

    def getMatchesInSentence(self, string):
        """Unfinished scan of *string* against the tree.

        NOTE(review): presumably meant to collect the stored titles that
        occur in *string*, but the matching logic was never written. The
        loops below have no effect and the method always returns ``None``.
        """
        matches = []  # placeholder for the result; not filled or returned yet
        for start in range(len(string)):
            x = self.tree
            if string[start] in x:
                for next_char in range(len(string) - start):
                    if string[next_char]:
                        ...

    def delete(self, path="affix.tree"):
        """Remove the saved tree file at *path*, if it exists.

        Args:
            path: File to remove; defaults to ``"affix.tree"``.
        """
        if os.path.exists(path):
            os.remove(path)
            print("Successfully Deleted the Gazetteer")
        else:
            print(f"No '{path}' file found in directory to delete.")
if __name__ == "__main__":
    # Uncomment the following lines to (re)create the affix.tree file:
    # tree = Affix_tree()
    # tree.delete()
    # print("Building the Gazetteer")
    # for title in tqdm.tqdm(gaz["title"]):
    #     tree.add(title.lower())
    # tree.save()

    # Smoke test: load the saved tree and look up a single title.
    gazetteer = Affix_tree(loading_file="affix.tree")
    print(gazetteer.has("I"))