/
fast_example.py
130 lines (110 loc) · 4.99 KB
/
fast_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from formal import *
from alg import *
from earley import *
from time import time
import libitg
import numpy as np
import sys
def test(lexicon, src_str, tgt_str, inspect_strings=False):
    """Parse one sentence pair and report forest sizes and wall-clock timings.

    The function prints the training instance, builds a source-side CFG from
    ``lexicon`` and one FSA per sentence, and then constructs two forests with
    ``libitg.earley``: D(x) (source sentence only, projected onto the target
    side) and D(x,y) (D(x) intersected with the target sentence).  Each forest
    is also passed through ``libitg.cleanup_forest`` so that rule counts and
    timings of the dirty and clean variants can be compared.

    Args:
        lexicon: mapping handed to ``libitg.make_source_side_finite_itg`` and
            ``libitg.make_target_side_finite_itg``.
        src_str: source sentence; its length is reported as the number of
            whitespace-separated tokens.
        tgt_str: target sentence, reported the same way.
        inspect_strings: when true, additionally print whether ``tgt_str`` is
            among the strings returned by ``libitg.summarise_strings`` for
            D(x), D(x,y) and the cleaned D(x,y), plus the time this took.

    Side effects:
        Prints to stdout, and writes then re-reads the cleaned D(x,y) forest
        through the relative path ``pickle-test`` using ``dill``.
    """
    # NOTE(review): the indentation of this file was lost in SOURCE.  The
    # pickle round trip and the final blank print are assumed to run
    # unconditionally (not only when inspect_strings is set) -- confirm
    # against the original layout.
    src_len = len(src_str.split())
    tgt_len = len(tgt_str.split())
    print('TRAINING INSTANCE: |x|=%d |y|=%d' % (src_len, tgt_len))
    print(src_str)
    print(tgt_str)
    print()

    # Source CFG over the whole lexicon, plus one FSA per sentence.
    src_cfg = libitg.make_source_side_finite_itg(lexicon)
    src_fsa = libitg.make_fsa(src_str)
    tgt_fsa = libitg.make_fsa(tgt_str)

    # Root symbols of the two forests, shared by parsing, cleanup and inspection.
    dx_root = Nonterminal('D(x)')
    dxy_root = Nonterminal('D(x,y)')

    elapsed = {}

    # D(x): intersect the source FSA with the source CFG, then project onto
    # the target vocabulary.  Cleaning is disabled inside earley so that the
    # dirty and clean forests can be compared below.
    began = time()
    raw_dx = libitg.earley(
        src_cfg,
        src_fsa,
        start_symbol=Nonterminal('S'),
        sprime_symbol=dx_root,
        clean=False,
    )
    Dx = libitg.make_target_side_finite_itg(raw_dx, lexicon)
    elapsed['D(x)'] = time() - began

    began = time()
    Dx_clean = libitg.cleanup_forest(Dx, dx_root)
    elapsed['clean D(x)'] = time() - began

    # D(x,y): intersect the (dirty) D(x) with the target FSA.
    began = time()
    Dxy = libitg.earley(
        Dx,
        tgt_fsa,
        start_symbol=dx_root,
        sprime_symbol=dxy_root,
        clean=False,
    )
    elapsed['D(x,y)'] = time() - began

    began = time()
    Dxy_clean = libitg.cleanup_forest(Dxy, dxy_root)
    elapsed['clean D(x,y)'] = time() - began

    dx_report = (len(Dx), elapsed['D(x)'], len(Dx_clean), elapsed['clean D(x)'])
    print('D(x): %d rules in %.4f secs or clean=%d rules at extra %.4f secs' % dx_report)
    dxy_report = (len(Dxy), elapsed['D(x,y)'], len(Dxy_clean), elapsed['clean D(x,y)'])
    print('D(x,y): %d rules in %.4f secs or clean=%d rules at extra %.4f secs' % dxy_report)

    if inspect_strings:
        t0 = time()
        checks = (
            (' y in D(x):', Dx, dx_root),
            (' y in D(x,y):', Dxy, dxy_root),
            (' y in clean D(x,y):', Dxy_clean, dxy_root),
        )
        for label, forest, root in checks:
            print(label, tgt_str in libitg.summarise_strings(forest, root))
        print(' gathering strings took %d secs' % (time() - t0))

    # Demonstration of pickling a forest: dump the clean D(x,y), load it back.
    import dill as pickle
    with open('pickle-test', 'wb') as sink:
        pickle.dump(Dxy_clean, sink)
    with open('pickle-test', 'rb') as source:
        restored = pickle.load(source)
    print(len(restored), 'loaded')

    print()
if __name__ == '__main__':
    # Test lexicon: each source word maps to its set of target options.
    # '-EPS-' among the options means the source word can be deleted
    # (e.g. `le`); the '-EPS-' source entry lists target words that can be
    # inserted (e.g. `the` and `a`).
    # A larger insertion set that was tried and left disabled:
    #   ['.', ',', 'a', 'the', 'some', 'of', 'bit', "'s", "'m", "'ve"]
    lexicon_table = (
        ('le', ['-EPS-', 'the', 'some', 'a', 'an']),
        ('e', ['-EPS-', 'and', '&', 'also', 'as']),
        ('chien', ['-EPS-', 'dog', 'canine', 'wolf', 'puppy']),
        ('noir', ['-EPS-', 'black', 'noir', 'dark', 'void']),
        ('blanc', ['-EPS-', 'white', 'blank', 'clear', 'flash']),
        ('petit', ['-EPS-', 'small', 'little', 'mini', 'almost']),
        ('petite', ['-EPS-', 'small', 'little', 'mini', 'almost']),
        ('.', ['-EPS-', '.', '!', '?', ',']),
        ('-EPS-', ['.', 'a', 'the', 'some', 'of']),
    )
    lexicon = defaultdict(set)
    for entry, options in lexicon_table:
        lexicon[entry].update(options)

    print('LEXICON')
    for src_word, tgt_words in lexicon.items():
        print('%s: %d options' % (src_word, len(tgt_words)))
    print()

    # Sentence pairs run in order.  The third and fourth pairs are identical
    # in the original script and are both kept so the output is unchanged.
    instances = [
        ('le chien noir',
         'black dog'),
        ('le chien noir',
         'the black dog .'),
        ('le petit chien noir e le petit chien blanc .',
         'the little white dog and the little black dog .'),
        ('le petit chien noir e le petit chien blanc .',
         'the little white dog and the little black dog .'),
        ('le petit chien noir e le petit chien blanc e le petit petit chien .',
         'the little black dog and the little white dog and the mini dog .'),
        ('le petit chien noir e le petit chien blanc e le petit chien petit blanc .',
         'the little black dog and the little white dog and the mini almost white dog .'),
    ]
    for source_sentence, target_sentence in instances:
        test(lexicon, source_sentence, target_sentence, inspect_strings=False)

    print('**** The next example should be out of the space of the constrained ITG ***')
    test(lexicon,
         'le petit chien noir e le petit chien blanc e le petit petit chien petit blanc e petit noir .',
         'the little black dog and the little white dog and the dog a bit white and a bit black .')