-
Notifications
You must be signed in to change notification settings - Fork 9
/
lg_example.py
138 lines (120 loc) · 6.11 KB
/
lg_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import lightgrep as lg
# Using the with statement ensures lightgrep resources are released when the
# block closes. Lightgrep initialization is relatively heavyweight, so it is
# better to loop over files/strings inside the lightgrep with statement than
# to set everything up again for each input.
# Sample inputs. The short text is used when a search context is reused on
# different data; the long text contains the words, the "bl0w" substitution
# and the backslash-prefixed identifiers that the patterns below look for.
testString = "hello, World"
searchString = "hello, World O'Sullivan, please don't bl0w up Nain s\\09-123/12-002 s\\EU-12-23 s\\AU-13-059 "

# The searches are given the UTF-8 encoded bytes rather than the str objects,
# so encode each sample once up front.
searchData, testData = (text.encode('utf-8') for text in (searchString, testString))

print(f"searchString: {searchString}")
# Search keys. Each entry is a tuple of
#   (pattern text, list of encoding names, lg.KeyOpts for that pattern).
# The position of an entry in this list is what the hit output further down
# prints as "keyindex".
keys = [
    # Fixed string, case-sensitive, listed under two encodings.
    (
        "hello",
        ["UTF-8", "ISO-8859-1"],
        lg.KeyOpts(fixedString=True, caseInsensitive=False),
    ),
    # Fixed string, case-insensitive.
    (
        "world",
        ["UTF-8"],
        lg.KeyOpts(fixedString=True, caseInsensitive=True),
    ),
    # Regular expression with a digit in the middle, e.g. "bl0w".
    (
        "bl\\dw",
        ["UTF-8"],
        lg.KeyOpts(fixedString=False, caseInsensitive=True),
    ),
    # Regular expression: one or more characters from a negated class.
    (
        "[^a-z]+",
        ["UTF-8"],
        lg.KeyOpts(fixedString=False, caseInsensitive=True),
    ),
    # A literal backslash has to be escaped twice: once for the Python string
    # literal and once more for the Lightgrep pattern syntax.
    (
        "s\\\\((A|E)U\\-)?\\d{1,3}-\\d{1,4}[^a-zA-Z0-9]",
        ["UTF-8"],
        lg.KeyOpts(fixedString=False, caseInsensitive=False),
    ),
    # A raw string removes the Python level of escaping, leaving only the
    # Lightgrep one.
    (
        r'\\[\d-]+',
        ["UTF-8"],
        lg.KeyOpts(fixedString=False, caseInsensitive=False),
    ),
]
# Simplest form: build the program straight from the key list, open a context
# on it, and run one search. Both objects are managed by `with`, and the hits
# are collected by the HitAccumulator handed to searchBuffer().
print("============================")
print("Results using 'with'")

withHits = lg.HitAccumulator()

with lg.make_program_from_patterns(keys, lg.ProgOpts()) as prog:
    with lg.Context(prog, lg.CtxOpts()) as ctx:
        # searchData is already a bytes object (the encoded searchString).
        withHitCount = ctx.searchBuffer(searchData, withHits)
        print(f"{withHitCount} hits found")

        for hit in withHits.Hits:
            print(f"hit at [{hit['start']},{hit['end']}) on keyindex {hit['keywordIndex']}, pattern is '{hit['pattern']}' with encoding chain '{hit['encChain']}'")
            # To also show the matched text, slice the searched bytes with the
            # hit offsets and decode the slice:
            # hBytes = searchData[hit['start']:hit['end']]
            # hText = hBytes.decode("utf-8")
            # print(f" hit text: '{hText}'")

        # Empty the accumulator once its hits have been reported.
        withHits.reset()
# Invalid input is reported through exceptions: parsing the malformed pattern
# below is expected to raise RuntimeError, whose message is printed.
print("---------------------------")
print("Exception due to a malformed pattern")

with lg.Pattern() as pat:
    malformed = '+++'
    try:
        pat.parse(malformed, lg.KeyOpts())
    except RuntimeError as err:
        print(err)
print("---------------------------")
print("Results creating program and pattern map separately from context")

# Longer form: create the program explicitly, fill it from the key list and
# compile it, then open a search context on the compiled program.
with lg.Program(0) as prog:
    # The pattern and FSM objects are used only to populate and compile prog.
    with lg.Pattern() as pat, lg.Fsm(0) as fsm:
        fsm.add_patterns(prog, pat, keys)
        prog.compile(fsm, lg.ProgOpts())

    myHits = lg.HitAccumulator()

    with lg.Context(prog, lg.CtxOpts()) as ctx:
        # First search: the long sample.
        myHitCount = ctx.searchBuffer(searchData, myHits)
        print(f"{myHitCount} hits found")
        for hit in myHits.Hits:
            print(f"hit at [{hit['start']},{hit['end']}) on keyindex {hit['keywordIndex']}, pattern is '{hit['pattern']}' with encoding chain '{hit['encChain']}'")
        myHits.reset()

        # Second search: same context and accumulator, different data.
        print("---------------------------")
        print("Results reusing context with different data")
        myHitCount = ctx.searchBuffer(testData, myHits)
        print(f"{myHitCount} hits found")
        for hit in myHits.Hits:
            print(f"hit at [{hit['start']},{hit['end']}) on keyindex {hit['keywordIndex']}, pattern is '{hit['pattern']}' with encoding chain '{hit['encChain']}'")
        myHits.reset()

        # Third search: same context again, using the startswith variant.
        print("---------------------------")
        print("Results reusing context again with and startswith()")
        myHitCount = ctx.searchBufferStartswith(searchData, myHits)
        print(f"{myHitCount} hits found")
        for hit in myHits.Hits:
            print(f"hit at [{hit['start']},{hit['end']}) on keyindex {hit['keywordIndex']}, pattern is '{hit['pattern']}' with encoding chain '{hit['encChain']}'")
        myHits.reset()
print("---------------------------")
print("Results adding patterns one at a time")

# Same flow as building the program in one call, except that each key is
# parsed individually and then added to the FSM once per encoding it lists.
with lg.Program(0) as prog:
    with lg.Pattern() as pat, lg.Fsm(0) as fsm:
        for i, (expression, encodings, options) in enumerate(keys):
            pat.parse(expression, options)
            # The parsed pattern is added under every encoding of this key,
            # always with the key's index in the list.
            for enc in encodings:
                fsm.add_pattern(prog, pat, enc, i)
        prog.compile(fsm, lg.ProgOpts())

    myHits = lg.HitAccumulator()

    with lg.Context(prog, lg.CtxOpts()) as ctx:
        # First search: the long sample.
        myHitCount = ctx.searchBuffer(searchData, myHits)
        print(f"{myHitCount} hits found")
        for hit in myHits.Hits:
            print(f"hit at [{hit['start']},{hit['end']}) on keyindex {hit['keywordIndex']}, pattern is '{hit['pattern']}' with encoding chain '{hit['encChain']}'")
        myHits.reset()

        # Second search: same context and accumulator, different data.
        print("---------------------------")
        print("Results reusing context with different data")
        myHitCount = ctx.searchBuffer(testData, myHits)
        print(f"{myHitCount} hits found")
        for hit in myHits.Hits:
            print(f"hit at [{hit['start']},{hit['end']}) on keyindex {hit['keywordIndex']}, pattern is '{hit['pattern']}' with encoding chain '{hit['encChain']}'")
        myHits.reset()

        # Third search: same context again, using the startswith variant.
        print("---------------------------")
        print("Results reusing context again with and startswith()")
        myHitCount = ctx.searchBufferStartswith(searchData, myHits)
        print(f"{myHitCount} hits found")
        for hit in myHits.Hits:
            print(f"hit at [{hit['start']},{hit['end']}) on keyindex {hit['keywordIndex']}, pattern is '{hit['pattern']}' with encoding chain '{hit['encChain']}'")
        myHits.reset()
print("---------------------------")
print("Results from a search across buffers")

# The single key 'hijk' straddles the two buffers: buf1 ends with "hi" and
# buf2 begins with "jk".
keys = [('hijk', ['UTF-8'], lg.KeyOpts(fixedString=True, caseInsensitive=False)),]
hits = lg.HitAccumulator()
buf1 = b'abcdefghi'
buf2 = b'jklmnopqr'

with lg.make_program_from_patterns(keys, lg.ProgOpts()) as prog:
    with lg.Context(prog, lg.CtxOpts()) as ctx:
        # Feed the buffers in order; each call is given the number of bytes
        # that were searched before it as its starting offset.
        offset = 0
        for chunk in (buf1, buf2):
            ctx.search(chunk, offset, hits)
            offset += len(chunk)
        # closeout() is called after the last buffer -- presumably to flush
        # hits still pending at end of input (confirm against the API docs).
        ctx.closeout(hits)

        print(f"{len(hits.Hits)} hits found")
        for hit in hits.Hits:
            print(f"hit at [{hit['start']},{hit['end']}) on keyindex {hit['keywordIndex']}, pattern is '{hit['pattern']}' with encoding chain '{hit['encChain']}'")
        hits.reset()