/
features.py
198 lines (181 loc) · 8.93 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import libitg
from libitg import Symbol, Terminal, Nonterminal, Span
from libitg import Rule, CFG
from libitg import FSA
from collections import defaultdict
import numpy as np
# Fallback pair of IBM1 log-probabilities, used by simple_features whenever a
# word pair is missing from `ibm1_probs`. Slot 0 is read for the 'x2y' and
# insertion features, slot 1 for the 'y2x' and deletion features.
# The value is approximately ln(1e-320), i.e. the log of a vanishingly small
# probability, so unseen pairs are heavily penalised without producing -inf.
LOG_PROB_DEFAULT = [-736.82724089097394, -736.82724089097394]
def get_source_word(fsa: FSA, origin: int, destination: int) -> str:
    """
    Look up the source word labelling the transition from `origin` to `destination`.

    The labels reported by `fsa.labels(origin, destination)` are collected and
    exactly one is expected; any other count trips an assertion, because this
    helper is only meant for unambiguous paths.
    """
    candidates = list(fsa.labels(origin, destination))
    n_found = len(candidates)
    assert n_found == 1, (
        'Use this function only when you know the path is unambiguous, found %d labels %s for (%d, %d)'
        % (n_found, candidates, origin, destination)
    )
    return candidates[0]
def get_target_word(symbol: Symbol):
    """
    Return the python object (the word) wrapped by a terminal symbol.

    Span annotations are stripped first via `symbol.root()`, so the result is
    the same however many annotations the terminal carries.

    Raises ValueError if `symbol` is not a terminal.
    """
    if symbol.is_terminal():
        bare_terminal = symbol.root()  # terminal free of span annotations
        return bare_terminal.obj()
    raise ValueError('I need a terminal, got %s of type %s' % (symbol, type(symbol)))
def get_bispans(symbol: Span):
    """
    Returns the bispans associated with a symbol.

    The first span returned corresponds to paths in the source FSA (typically a span in the source sentence),
    the second span returned corresponds to either
        a) paths in the target FSA (typically a span in the target sentence)
        or b) paths in the length FSA
    depending on the forest where this symbol comes from.

    A symbol that does not carry two nested span annotations has no bispan. In that case the
    function does not raise: it returns `(None, obj), (None, None)`, where `obj` is the object
    underlying the un-annotated symbol (for a symbol with a single annotation, the object of the
    symbol inside that annotation). Callers detect this case by checking whether the first
    element of the first pair is None.
    """
    if not isinstance(symbol, Span):
        # no annotation at all: hand back the underlying object instead of spans
        return (None, symbol.obj()), (None, None)
    inner, start2, end2 = symbol.obj()  # this unwraps the target or length annotation
    # Test the type of the inner symbol rather than `len(inner.obj()) == 3`: a plain symbol whose
    # obj() merely has length 3 (e.g. a 3-character string) would otherwise be unpacked as if it
    # were a (symbol, start, end) triple, yielding characters as span bounds.
    if not isinstance(inner, Span):
        # single annotation only: there is no source span to report
        return (None, inner.obj()), (None, None)
    _, start1, end1 = inner.obj()  # this unwraps the source annotation
    return (start1, end1), (start2, end2)
def _mean_source_embedding(src_fsa, embeddings, start, end, dim=128):
    """
    Average the embeddings of the source words read between states `start` and `end`.

    Words for which `embeddings[word]` raises KeyError are reported on stdout and left out of
    the average. The result is a zero vector of size `dim` when no word contributes.
    """
    total = np.zeros(dim)
    found = 0
    for state in range(start, end):
        src_word = get_source_word(src_fsa, state, state + 1)
        try:
            total += embeddings[src_word]
        except KeyError:
            print ("Word {} not found in Word2Vec".format(src_word))
        else:
            found += 1
    # normalise by the number of words that actually contributed to THIS range
    if found > 0:
        total /= found
    return total


def _add_terminal_features(fmap, symbol, src_fsa, eps, ibm1_probs, sparse_del, sparse_ins, sparse_trans):
    """Add deletion / insertion / translation features for a rule whose RHS is the terminal `symbol`."""
    fmap['type:terminal'] += 1.0
    (s1, s2), _ = get_bispans(symbol)
    if symbol.root() == eps:  # symbol.root() gives us a Terminal free of annotation
        # deletion: for sure there is a source word
        src_word = get_source_word(src_fsa, s1, s2)
        fmap['type:deletion'] += 1.0
        if ibm1_probs:  # dense version
            fmap['ibm1:del:logprob'] += ibm1_probs.get(src_word, {}).get('-EPS-', LOG_PROB_DEFAULT)[1]
        if sparse_del:  # sparse version
            fmap['del:%s' % src_word] += 1.0
        return

    # for sure there's a target word
    tgt_word = get_target_word(symbol)
    if s1 == s2:  # has not consumed any source word, must be an eps rule
        fmap['type:insertion'] += 1.0
        if ibm1_probs:  # dense version
            fmap['ibm1:ins:logprob'] += ibm1_probs.get('-EPS-', {}).get(tgt_word, LOG_PROB_DEFAULT)[0]
        if sparse_ins:  # sparse version
            fmap['ins:%s' % tgt_word] += 1.0
        return

    # translation: for sure there's a source word
    src_word = get_source_word(src_fsa, s1, s2)
    fmap['type:translation'] += 1.0
    if ibm1_probs:  # dense version
        logprobs = ibm1_probs.get(src_word, {}).get(tgt_word, LOG_PROB_DEFAULT)
        fmap['ibm1:x2y:logprob'] += logprobs[0]
        fmap['ibm1:y2x:logprob'] += logprobs[1]
    if sparse_trans:  # sparse version
        fmap['trans:%s/%s' % (src_word, tgt_word)] += 1.0


def _add_embedding_features(fmap, src_fsa, embeddings, start, end):
    """Add averaged-embedding features for the words inside and outside the source span [start, end)."""
    # inside embeddings
    inside = _mean_source_embedding(src_fsa, embeddings, start, end)
    for dimension, value in enumerate(inside):
        fmap['inside:emb:%i' % dimension] += value

    # outside embeddings: context to the left and to the right of the span
    src_len = src_fsa.nb_states() - 1
    has_left = start > 0
    has_right = end < src_len
    if has_left:
        left_avg = _mean_source_embedding(src_fsa, embeddings, 0, start)
    if has_right:
        right_avg = _mean_source_embedding(src_fsa, embeddings, end, src_len)

    if has_left and has_right:
        outside = (left_avg + right_avg) / 2
    elif has_left:
        outside = left_avg
    elif has_right:
        outside = right_avg
    else:
        # NOTE(review): the span covers the whole sentence, so there is no outside context.
        # The inside average is reused here, as the code has always done; a zero vector may
        # have been intended -- confirm before changing, since it alters feature values.
        outside = inside
    for dimension, value in enumerate(outside):
        fmap['outside:emb:%i' % dimension] += value


def _add_skip_bigram_features(fmap, src_fsa, start, end):
    """Count bigrams (w[i], w[i+1]) and skip-bigrams (w[i], w[i+2]) over the source span [start, end)."""
    first = None
    skip = None
    for state in range(start, end):
        src_word = get_source_word(src_fsa, state, state + 1)
        if first is not None and skip is not None:
            fmap['skip:%s*%s' % (first, skip)] += 1      # adjacent pair
            fmap['skip:%s*%s' % (first, src_word)] += 1  # pair with one word skipped
        first = skip
        skip = src_word
    # last adjacent pair of the span
    # NOTE(review): for spans shorter than two words this emits a key containing 'None'
    # (e.g. 'skip:None*word'); kept as is so existing feature names do not change.
    fmap['skip:%s*%s' % (first, skip)] += 1


def simple_features(edge: Rule, src_fsa: FSA, eps=Terminal('-EPS-'),
                    sparse_del=False, sparse_ins=False, sparse_trans=False, sparse_span=False,
                    ibm1_probs=False, embeddings=False, skip_grams=False) -> dict:
    """
    Featurises an edge given
        * rule and spans
        * src sentence as an FSA
    crucially, note that the target sentence y is not available!

    :param edge: rule to featurise; its `lhs` and `rhs` symbols are inspected with get_bispans
    :param src_fsa: source sentence FSA; source words are read between consecutive states and
        `nb_states() - 1` is used as the sentence length (assumes a linear-chain FSA -- TODO confirm)
    :param eps: terminal whose presence (after `root()`) marks a deletion rule
    :param sparse_del: add a lexicalised indicator 'del:<src>' for deletions
    :param sparse_ins: add a lexicalised indicator 'ins:<tgt>' for insertions
    :param sparse_trans: add a lexicalised indicator 'trans:<src>/<tgt>' for translations
    :param sparse_span: add one indicator per span length instead of accumulating the length
        in the dense 'span:source' / 'span:target' features
    :param ibm1_probs: falsy to disable; otherwise a nested mapping
        `ibm1_probs[src_word][tgt_word]` -> pair of log-probabilities ('-EPS-' stands for the
        empty word). Slot 0 feeds the 'x2y' and insertion features, slot 1 the 'y2x' and
        deletion features; missing entries fall back to LOG_PROB_DEFAULT
    :param embeddings: falsy to disable; otherwise an object indexable by source word that raises
        KeyError for unknown words and yields vectors of size 128
        (presumably a Word2Vec model, going by the log message -- verify against caller)
    :param skip_grams: add bigram / skip-bigram counts over the source span of the LHS
    :return: a defaultdict(float) mapping feature names to values
    """
    fmap = defaultdict(float)

    # --- features of the rule shape ---
    if len(edge.rhs) == 2:  # binary rule
        fmap['type:binary'] += 1.0
        # here we could have sparse features of the source string as a function of spans being concatenated
        (ls1, ls2), _ = get_bispans(edge.rhs[0])  # left of RHS
        (rs1, rs2), _ = get_bispans(edge.rhs[1])  # right of RHS
        # TODO: double check these, assign features, add some more
        if ls1 == ls2:  # deletion of source left child
            fmap['order:slcdel'] += 1.0
        if rs1 == rs2:  # deletion of source right child
            fmap['order:srcdel'] += 1.0
        if ls2 == rs1:  # monotone
            fmap['order:mon'] += 1.0
        if ls1 == rs2:  # inverted
            fmap['order:inv'] += 1.0
    else:  # unary
        symbol = edge.rhs[0]
        if symbol.is_terminal():  # terminal rule
            _add_terminal_features(fmap, symbol, src_fsa, eps, ibm1_probs,
                                   sparse_del, sparse_ins, sparse_trans)
        else:  # S -> X
            fmap['top'] += 1.0

    # --- span of source and target side ---
    (s1, s2), (t1, t2) = get_bispans(edge.lhs)
    has_span = s1 is not None
    if not has_span:
        # get_bispans found no bispan; s2 then holds the object underlying the symbol
        fmap['root:%s' % (s2)] += 1
    elif sparse_span:
        fmap['span:source:%i' % (s2 - s1)] += 1.0
        fmap['span:target:%i' % (t2 - t1)] += 1.0
    else:
        fmap['span:source'] += s2 - s1
        fmap['span:target'] += t2 - t1

    # --- embedding features ---
    if has_span and embeddings:
        _add_embedding_features(fmap, src_fsa, embeddings, s1, s2)

    # --- Skip-Bigram features (only for sentences longer than one word) ---
    if has_span and skip_grams and src_fsa.nb_states() - 1 > 1:
        _add_skip_bigram_features(fmap, src_fsa, s1, s2)

    return fmap