/
patricia.py
378 lines (332 loc) · 12.5 KB
/
patricia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
"""
A PATRICIA trie implementation for efficient matching of string collections on
text.
This class has an (Py2.7+) API nearly equal to dictionaries.
*Deleting* entries is a "half-supported" operation only. The key appears
"removed", but the trie is not actually changed, only the node state is
changed from terminal to non-terminal. I.e., if you frequently delete keys,
the compaction will become fragmented and less efficient. To mitigate this
effect, make a copy of the trie (using a copy constructor idiom)::
T = trie(**T)
If you are only interested in scanning for the *presence* of keys, but do not
care about mapping a value to each key, using `None` as the value of your
keys and scanning with ``key(S, i, j, None)`` at every offset ``i:j`` in the
string ``S`` is perfectly fine (because the return value will be the key
string iff a full match was made and `None` otherwise). In other words, it
is not necessary to create slices of strings to scan in a window only::
>>> T = trie(present=None)
>>> T.key('is absent here', 3, 9, None) # scan in second word [3:9]
>>> T.key('is present here', 3, 10, None) # scan in second word [3:10]
'present'
License: Apache License v2 (http://www.apache.org/licenses/LICENSE-2.0.html)
"""
__author__ = 'Florian Leitner <florian.leitner@gmail.com>'
__version__ = '10'
class _NonTerminal():
pass
__NON_TERMINAL__ = _NonTerminal()
# recursion functions
def _count(node):
"Count the number of terminal nodes in this branch."
count = 0 if (node._value is __NON_TERMINAL__) else 1
for _, child in node._edges.values():
count += _count(child)
return count
def _keys(node, accu):
"Yield keys of terminal nodes in this branch."
for key, value in _items(node, accu):
yield key
def _items(node, accu):
"Yield key, value pairs of terminal nodes in this branch."
if node._value is not __NON_TERMINAL__:
yield ''.join(accu), node._value
for edge, child in node._edges.values():
accu.append(edge)
for key, value in _items(child, accu):
yield key, value
accu.pop()
def _values(node):
"Yield values of terminal nodes in this branch."
if node._value is not __NON_TERMINAL__:
yield node._value
for edge, child in node._edges.values():
for value in _values(child):
yield value
class trie():
"""
Usage Example::
>>> T = trie('root', key='value', king='kong') # a root and two nodes
>>> T['four'] = None # setting new values as in a dict
>>> '' in T # check if the value exits (note: the [empty] root is '')
True
>>> 'kong' in T # existence checks as in a dict
False
>>> T['king'] # get the value for an exact key ... as in a dict!
'kong'
>>> T['kong'] # error from non-existing keys (as in a dict...)
Traceback (most recent call last):
...
KeyError: 'kong'
>>> len(T) # count keys ("terminals") in the tree
4
>>> sorted(T) # plus "traditional stuff": keys(), values(), and items()
['', 'four', 'key', 'king']
>>> # scanning a text S with key(S), value(S), and item(S):
>>> S = 'keys and kewl stuff'
>>> T.key(S) # report the (longest) key that is a prefix of S
'key'
>>> T.value(S, 9) # using offsets; NB: empty root always matches!
'root'
>>> del T[''] # interlude: deleting keys and root is the empty key
>>> T.item(S, 9) # raise error if no key is a prefix of S
Traceback (most recent call last):
...
KeyError: 'k'
>>> # info: the error string above contains the matched path so far
>>> T.item(S, 9, default=None) # avoid the error by specifying a default
(None, None)
>>> # iterate all matching content with keys(S), values(S), and items(S):
>>> list(T.items(S))
[('key', 'value')]
>>> T.isPrefix('k') # reverse lookup: check if S is a prefix of any key
True
>>> T.isPrefix('kong')
False
>>> sorted(T.iter('k')) # and get all keys that have S as prefix
['key', 'king']
"""
def __init__(self, *value, **branch):
"""
Create a new tree node.
Any arguments will be used as the ``value`` of this node.
If keyword arguments are given, they initialize a whole ``branch``.
Note that `None` is a valid value for a node.
"""
self._edges = {}
self._value = __NON_TERMINAL__
if len(value):
if len(value) == 1:
self._value = value[0]
else:
self._value = value
for key, val in branch.items():
self[key] = val
@staticmethod
def __offsets(strlen, start, end):
# Return the correct start, end offsets for a string of length `strlen`.
return (max(0, strlen + start) if start < 0 else start,
strlen if end is None else end)
@staticmethod
def __check(value, match, default):
if value is not __NON_TERMINAL__:
return match, value
elif default is not __NON_TERMINAL__:
return None, default
else:
raise KeyError(match)
def _find(self, path, start, *end):
if start < len(path) and path[start] in self._edges:
edge, child = self._edges[path[start]]
if path.startswith(edge, start, *end):
return child, start + len(edge)
return None, start # return None
def _next(self, path, start, *end):
try:
edge, child = self._edges[path[start]]
if path.startswith(edge, start, *end):
return child, start + len(edge)
except KeyError:
pass
raise KeyError(path) # raise error
def _scan(self, rvalFun, string, start=0, *end):
node = self
start, _ = trie.__offsets(len(string), start, None)
while node is not None:
if node._value is not __NON_TERMINAL__:
yield rvalFun(string, start, node._value)
node, start = node._find(string, start, *end)
def __setitem__(self, key, value):
node = self
keylen = len(key)
idx = 0
while keylen != idx:
if key[idx] in node._edges:
node, idx = node.__followEdge(key, idx)
else:
# no common prefix, create a new edge and (leaf) node
node._edges[key[idx]] = (key[idx:], trie(value))
break
else:
node._value = value
def __followEdge(self, key, idx):
edge, child = self._edges[key[idx]]
if key.startswith(edge, idx):
# the whole prefix matches; advance
return child, idx + len(edge)
else:
# split edge after the matching part of the key
pos = 1
last = min(len(edge), len(key) - idx)
while pos < last and edge[pos] == key[idx + pos]:
pos += 1
split = trie()
split._edges[edge[pos]] = (edge[pos:], child)
self._edges[key[idx]] = (edge[:pos], split)
return split, idx + pos
def __getitem__(self, key):
node = self
keylen = len(key)
idx = 0
while keylen != idx:
node, idx = node._next(key, idx)
if node._value is __NON_TERMINAL__:
raise KeyError(key)
else:
return node._value
def __delitem__(self, key):
node = self
keylen = len(key)
idx = 0
while keylen != idx:
node, idx = node._next(key, idx)
if node._value is __NON_TERMINAL__:
raise KeyError(key)
node._value = __NON_TERMINAL__
def __contains__(self, key):
node = self
keylen = len(key)
idx = 0
while idx != keylen and node is not None:
node, idx = node._find(key, idx)
return False if node is None else (node._value is not __NON_TERMINAL__)
def __iter__(self):
return _keys(self, [])
def __len__(self):
return _count(self)
def __repr__(self):
string = ['trie({']
first = True
for key, value in _items(self, []):
if first:
first = False
else:
string.append(', ')
string.append(repr(key))
string.append(': ')
string.append(repr(value))
string.append('})')
return ''.join(string)
def key(self, string, start=0, end=None, default=__NON_TERMINAL__):
"""
Return the longest key that is a prefix of ``string`` (beginning at
``start`` and ending at ``end``).
If no key matches, raise a `KeyError` or return the ``default`` value
if it was set.
"""
return self.item(string, start, end, default)[0]
def keys(self, *scan):
"""
Return all keys (that are a prefix of ``string``
(beginning at ``start`` (and terminating before ``end``))).
"""
l = len(scan)
if l == 0:
return _keys(self, [])
else:
if l == 1:
scan = (scan[0], 0)
getKey = lambda string, idx, value: string[scan[1]:idx]
return self._scan(getKey, *scan)
def value(self, string, start=0, end=None, default=__NON_TERMINAL__):
"""
Return the value of the longest key that is a prefix of ``string``
(beginning at ``start`` and ending at ``end``).
If no key matches, raise a `KeyError` or return the ``default`` value
if it was set.
"""
return self.item(string, start, end, default)[1]
def values(self, *scan):
"""
Return all values (for keys that are a prefix of ``string``
(beginning at ``start`` (and terminating before ``end``))).
"""
l = len(scan)
if l == 0:
return _values(self)
else:
if l == 1:
scan = (scan[0], 0)
getValue = lambda string, idx, value: value
return self._scan(getValue, *scan)
def item(self, string, start=0, end=None, default=__NON_TERMINAL__):
"""
Return the key, value pair of the longest key that is a prefix of
``string`` (beginning at ``start`` and ending at ``end``).
If no key matches, raise a `KeyError` or return the `None`,
``default`` pair if any ``default`` value was set.
"""
node = self
strlen = len(string)
start, end = trie.__offsets(strlen, start, end)
idx = start
last = self._value
while idx < strlen:
node, idx = node._find(string, idx, end)
if node is None:
break
elif node._value is not __NON_TERMINAL__:
last = node._value
return trie.__check(last, string[start:idx], default)
def items(self, *scan):
"""
Return all key, value pairs (for keys that are a prefix of ``string``
(beginning at ``start`` (and terminating before ``end``))).
"""
l = len(scan)
if l == 0:
return _items(self, [])
else:
if l == 1:
scan = (scan[0], 0)
getItem = lambda string, idx, value: (string[scan[1]:idx], value)
return self._scan(getItem, *scan)
def isPrefix(self, prefix):
"Return True if any key starts with ``prefix``."
node = self
plen = len(prefix)
idx = 0
while idx < plen:
len_left = plen - idx
for edge, child in node._edges.values():
e = edge[:len_left] if (len_left < len(edge)) else edge
if prefix.startswith(e, idx):
node = child
idx += len(edge)
break
else:
return False
return True
def iter(self, prefix):
"Return an iterator over all keys that start with ``prefix``."
node = self
plen = len(prefix)
idx = 0
while idx < plen:
try:
node, idx = node._next(prefix, idx)
except KeyError:
break
return node._accumulate(prefix, idx)
def _accumulate(self, prefix, idx):
node = self
accu = [prefix]
if idx != len(prefix):
remainder = prefix[idx:]
for edge, child in node._edges.values():
if edge.startswith(remainder):
node = child
accu.append(edge[len(remainder):])
break
else:
return iter([])
return _keys(node, accu)