/
the_big_bang.py
123 lines (77 loc) · 3.58 KB
/
the_big_bang.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
## The Big Bang ##
# In this exercise you will implement a text search engine.
# * You can use one store object with any data structure that suits your idea.
# * The tests are divided into steps; please pass one step at a time, and keep the previous steps working.
# * Try to complete as many steps as you can but keep in mind the complexity and quality of your solution.
#
#
# A few tips before you go -
# * We suggest that you read all tests and assert messages before you start implementing; it will save you some time in the advanced steps.
# * Feel free to install and import any external package that might help you, but make sure that the implementation of the DB is yours
# (and not imported from the external package).
from typing import List
def _generate_data(db):
    """
    Indexes every non-blank line of ./transcript.txt into the DB.
    The ID of each document is the zero-based position of its line in the file;
    blank lines are skipped but still advance the counter.
    :param db: The data structure to use.
    """
    # The context manager guarantees the file handle is closed, even if index() raises.
    with open("./transcript.txt", 'r') as transcript:
        # Iterating the file object yields the same lines as readlines(),
        # without loading the whole file into memory first.
        for i, row in enumerate(transcript):
            text = row.strip()
            if text:
                index(db, text, id_=i)
### Your Code ###
# Change this line to point to your DB object.
# This single object is the store passed as the first argument to every
# index() / match() call in the tests below.
db = None
def index(db, text: str, id_: int) -> None:
    """
    Stores the document and makes it available to search.
    :param db: The data structure to use.
    :param text: The text to index to the DB.
    :param id_: The ID for the given doc.
    :raises NotImplementedError: Always, until the exercise is implemented.
    """
    # NotImplemented is a non-callable constant meant for rich comparisons;
    # calling it raises an unrelated TypeError. NotImplementedError is the
    # exception intended for "not written yet" placeholders.
    raise NotImplementedError("Implement Me!")
def match(db, text: str) -> List[int]:
    """
    Returns the IDs of documents that contain ANY of the words in this text.
    This operation is case-insensitive.
    :param db: The data structure to use.
    :param text: Text that we want to search for.
    :return: The list of matching document IDs.
    :raises NotImplementedError: Always, until the exercise is implemented.
    """
    # NotImplemented is a non-callable constant meant for rich comparisons;
    # calling it raises an unrelated TypeError. NotImplementedError is the
    # exception intended for "not written yet" placeholders.
    raise NotImplementedError("Implement Me!")
### Sanity Tests ###
# These checks run at import time, in order, against the shared `db` object;
# each step builds on the documents indexed by the steps before it.
# Step 1: a single indexed document can be found by one of its words.
index(db, "Our whole universe was in a hot, dense state", id_=1)
assert match(db, "universe") == [1], "The word 'universe' should appear in the DB"
# Step 2: a multi-word query matches if ANY of its words is present, and
# matching ignores case ('AGO' must find 'ago').
index(db, "Then nearly fourteen billion expansion ago expansion started, wait!", id_=1)
assert match(db, "It all started with the big bang!") == [1], "The word 'started' should appear in the DB"
assert match(db, "AGO") == [1], "The word 'ago' should appear in the DB"
# Step 3: a word shared by several documents returns all of their IDs.
# Note that both documents contain 'bang!' with trailing punctuation while the query is 'BANG'.
index(db, "Our best and brightest figure that it'll make an even bigger bang!", id_=1)
index(db, "Music and mythology, Einstein and astrology. It all started with the big bang!", id_=2)
assert match(db, "BANG") == [1, 2], 'The word "bang" should appear in the DB multiple times'
# Step 4: indexing an existing ID again replaces the old document, so words
# that appeared only in the old text must stop matching.
index(db, "It's expanding ever outward but one day", id_=1)
assert match(db, "expanding") == [1], 'Document with id = 1 contains the word "expanding"'
index(db, "Our best and brightest figure that it'll make an even bigger bang!", id_=1)
assert match(db, "expanding") == [], "Document with id = 1 was overridden by a new doc that does not contain the word expanding"
assert match(db, "brightest") == [1], "Document with id = 1 contains the word 'brightest'"
# Step 5: word variations must match each other ('prove' finds 'proving').
index(db, "It doesn't need proving", id_=1)
assert match(db, "prove") == [1], "Our search should support variations match, so in this case it should find all documents containing - proving, prove, proves, proved.."
# Step 6: bulk-load ./transcript.txt (one document per non-blank line) and
# expect at least 70 documents to match 'jedi'.
_generate_data(db)
assert len(match(db, 'jedi')) >= 70, "expected more appearances of the word 'jedi'"
### Performance Tests ###
import time
class Timer:
    """
    Context manager that measures how long the body of its ``with`` block takes.

    After the block exits, ``duration`` holds the elapsed time in seconds.
    ``start`` and ``end`` are readings of ``time.perf_counter()``; only their
    difference is meaningful, they are not wall-clock timestamps.
    """

    def __init__(self):
        # All three stay None until the timer has been entered / exited.
        self.start = None
        self.end = None
        self.duration = None

    def __enter__(self):
        # perf_counter() is monotonic and uses the highest-resolution clock
        # available, which matters when measuring sub-millisecond durations;
        # time.time() can jump and may be too coarse.
        self.start = time.perf_counter()
        # Returning self lets callers write `with Timer() as t:`.
        return self

    def __exit__(self, type, value, traceback):
        # Parameter names are kept as-is for interface compatibility.
        self.end = time.perf_counter()
        self.duration = self.end - self.start
        # Implicitly returns None, so exceptions from the body still propagate.
# Time a single lookup against the DB that was filled by the sanity tests above.
t = Timer()
with t:
    match(db, "jedi")
# One match() call must complete in under 0.0001 seconds (100 microseconds).
assert t.duration < 0.0001, "Too slow :("