r"""Block&script eval.Conll17 for evaluating LAS,UAS,etc as in CoNLL2017 UD shared task.
This is a reimplementation of the CoNLL2017 shared task official evaluation script,
http://universaldependencies.org/conll17/evaluation.html
The gold trees and predicted (system-output) trees need to be sentence-aligned
e.g. using `util.ResegmentGold`.
Unlike in `eval.Parsing`, the gold and predicted trees can have different tokenization.
An example usage and output::
$ udapy read.Conllu zone=gold files=gold.conllu \
read.Conllu zone=pred files=pred.conllu ignore_sent_id=1 \
eval.Conll17
Metric | Precision | Recall | F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Words | 27.91 | 52.17 | 36.36 | 100.00
UPOS | 27.91 | 52.17 | 36.36 | 100.00
XPOS | 27.91 | 52.17 | 36.36 | 100.00
Feats | 27.91 | 52.17 | 36.36 | 100.00
Lemma | 27.91 | 52.17 | 36.36 | 100.00
UAS | 16.28 | 30.43 | 21.21 | 58.33
LAS | 16.28 | 30.43 | 21.21 | 58.33
CLAS | 10.34 | 16.67 | 12.77 | 37.50
For evaluating multiple systems and test sets (as in CoNLL 2017)
stored in `systems/system_name/testset_name.conllu`, you can use::
#!/bin/bash
SYSTEMS=`ls systems`
[[ $# -ne 0 ]] && SYSTEMS=$@
set -x
set -e
for sys in $SYSTEMS; do
mkdir -p results/$sys
for testset in `ls systems/$sys`; do
udapy read.Conllu zone=gold files=gold/$testset \
read.Conllu zone=pred files=systems/$sys/$testset ignore_sent_id=1 \
util.ResegmentGold \
eval.Conll17 print_results=0 print_raw=1 \
> results/$sys/${testset%.conllu}
done
done
python3 `python3 -c 'import udapi.block.eval.conll17 as x; print(x.__file__)'` -r 100
The last line executes this block as a script and performs bootstrap resampling with 100 resamples
(the default is 1000; keep the default or a higher value unless you are only testing the interface).
This prints the ranking with confidence intervals (95% by default) and p-values for each
pair of systems with neighboring ranks. If the difference in LAS is significant
(according to a paired bootstrap test, by default if p < 0.05),
a separating line is printed between the two systems.
The output looks like::
1. Stanford 76.17 ± 0.12 (76.06 .. 76.30) p=0.001
------------------------------------------------------------
2. C2L2 74.88 ± 0.12 (74.77 .. 75.01) p=0.001
------------------------------------------------------------
3. IMS 74.29 ± 0.13 (74.16 .. 74.43) p=0.001
------------------------------------------------------------
4. HIT-SCIR 71.99 ± 0.14 (71.84 .. 72.12) p=0.001
------------------------------------------------------------
5. LATTICE 70.81 ± 0.13 (70.67 .. 70.94) p=0.001
------------------------------------------------------------
6. NAIST-SATO 70.02 ± 0.13 (69.89 .. 70.16) p=0.001
------------------------------------------------------------
7. Koc-University 69.66 ± 0.13 (69.52 .. 69.79) p=0.002
------------------------------------------------------------
8. UFAL-UDPipe-1-2 69.36 ± 0.13 (69.22 .. 69.49) p=0.001
------------------------------------------------------------
9. UParse 68.75 ± 0.14 (68.62 .. 68.89) p=0.003
------------------------------------------------------------
10. Orange-Deskin 68.50 ± 0.13 (68.37 .. 68.62) p=0.448
11. TurkuNLP 68.48 ± 0.14 (68.34 .. 68.62) p=0.029
------------------------------------------------------------
12. darc 68.29 ± 0.13 (68.16 .. 68.42) p=0.334
13. conll17-baseline 68.25 ± 0.14 (68.11 .. 68.38) p=0.003
------------------------------------------------------------
14. MQuni 67.93 ± 0.13 (67.80 .. 68.06) p=0.062
15. fbaml 67.78 ± 0.13 (67.65 .. 67.91) p=0.283
16. LyS-FASTPARSE 67.73 ± 0.13 (67.59 .. 67.85) p=0.121
17. LIMSI-LIPN 67.61 ± 0.14 (67.47 .. 67.75) p=0.445
18. RACAI 67.60 ± 0.13 (67.46 .. 67.72) p=0.166
19. IIT-Kharagpur 67.50 ± 0.14 (67.36 .. 67.64) p=0.447
20. naistCL 67.49 ± 0.15 (67.34 .. 67.63)
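Each line of the per-sentence raw output (print_raw=1 in the script above) contains
four space-separated integers: the number of predicted tokens, gold tokens, aligned words
and LAS-correct words of one sentence; these are exactly the columns the bootstrap script reads.
The p-value between two systems with neighboring ranks is a paired bootstrap test with
add-one smoothing::
    p = (resamples in which the lower-ranked system has the higher LAS F1 + 1) / (resamples + 1)
The reported score and confidence interval are the median and the empirical quantiles
of the resampled macro-averaged (over test sets) LAS F1 values.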
TODO: Bootstrap currently reports only LAS, but all the other measures could be added as well.
"""
import argparse
import difflib
import logging
import os
import random
import sys
from collections import Counter
from udapi.core.basewriter import BaseWriter
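# Universal relations excluded from CLAS (content-word LAS): function words and punctuation
# are ignored both in the gold/predicted word counts and in the correct-match counts.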
CLAS_IGNORE = {'aux', 'case', 'cc', 'clf', 'cop', 'det', 'mark', 'punct'}
class Conll17(BaseWriter):
"""Evaluate labeled and unlabeled attachment score (LAS and UAS)."""
def __init__(self, gold_zone='gold', print_raw=False, print_results=True, **kwargs):
"""Args:
gold_zone - Which zone contains the gold-standard trees? (The other zone is taken as the prediction.)
print_raw - Print raw counts (pred, gold, Words, LAS) for each sentence.
This is useful for bootstrap-resampling post-processing to get confidence intervals.
print_results - Print a table with overall results after all documents are processed.
"""
super().__init__(**kwargs)
self.gold_zone = gold_zone
self.total_count = Counter()
self.print_raw = print_raw
self.print_results = print_results
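# process_tree is called on every tree; for the tree in the gold zone it returns immediately,
# so each predicted tree is compared against the gold tree of the same bundle exactly once.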
def process_tree(self, tree):
gold_tree = tree.bundle.get_tree(self.gold_zone)
if tree == gold_tree:
return
pred_nodes = tree.descendants
gold_nodes = gold_tree.descendants
pred_forms = [n.form.lower() for n in pred_nodes]
gold_forms = [n.form.lower() for n in gold_nodes]
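# Align predicted and gold tokens on their lowercased forms. Only the 'equal' blocks
# found by difflib.SequenceMatcher yield one-to-one (pred, gold) word alignments,
# so differently tokenized words remain unaligned.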
matcher = difflib.SequenceMatcher(None, pred_forms, gold_forms, autojunk=False)
aligned = []
for diff in matcher.get_opcodes():
edit, pred_lo, pred_hi, gold_lo, gold_hi = diff
if edit == 'equal':
aligned.extend(zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]))
align_map = {tree: gold_tree}
for p_node, g_node in aligned:
align_map[p_node] = g_node
count = Counter()
count['pred'] = len(pred_nodes)
count['gold'] = len(gold_nodes)
count['Words'] = len(aligned)
count['pred_clas'] = len([n for n in pred_nodes if n.udeprel not in CLAS_IGNORE])
count['gold_clas'] = len([n for n in gold_nodes if n.udeprel not in CLAS_IGNORE])
count['alig_clas'] = len([n for _, n in aligned if n.udeprel not in CLAS_IGNORE])
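# For every aligned (pred, gold) pair count matching morphological attributes,
# a matching (aligned) parent for UAS, parent plus universal deprel for LAS,
# and LAS restricted to content words for CLAS.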
for p_node, g_node in aligned:
for attr in ('UPOS', 'XPOS', 'Feats', 'Lemma'):
if p_node.get_attrs([attr.lower()]) == g_node.get_attrs([attr.lower()]):
count[attr] += 1
if align_map.get(p_node.parent) == g_node.parent:
count['UAS'] += 1
if p_node.udeprel == g_node.udeprel:
count['LAS'] += 1
if g_node.udeprel not in CLAS_IGNORE:
count['CLAS'] += 1
self.total_count.update(count)
if self.print_raw:
scores = [str(count[s]) for s in ('pred', 'gold', 'Words', 'LAS')]
print(' '.join(scores))
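# Print the table with precision, recall, F1 and aligned accuracy aggregated
# over all processed documents (see the example in the module docstring).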
def process_end(self):
if not self.print_results:
return
# Redirect the default filehandle to the file specified by self.files
self.before_process_document(None)
metrics = ('Words', 'UPOS', 'XPOS', 'Feats', 'Lemma', 'UAS', 'LAS', 'CLAS')
print("Metric | Precision | Recall | F1 Score | AligndAcc")
print("-----------+-----------+-----------+-----------+-----------")
pred, gold = self.total_count['pred'], self.total_count['gold']
alig = self.total_count['Words']
for metric in metrics:
if metric == 'CLAS':
pred, gold = self.total_count['pred_clas'], self.total_count['gold_clas']
alig = self.total_count['alig_clas']
correct = self.total_count[metric]
precision = correct / pred if pred else 0
recall = correct / gold if gold else 0
alignacc = correct / alig if alig else 0
fscore = 2 * correct / (pred + gold) if pred + gold else 0
print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{:10.2f}".format(
metric, 100 * precision, 100 * recall, 100 * fscore, 100 * alignacc))
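# Note: with precision = correct/pred and recall = correct/gold, the F1 score
# 2*P*R / (P + R) simplifies to 2*correct / (pred + gold), which is the formula
# used both in process_end above and in prec_rec_f1 below.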
def prec_rec_f1(correct, pred, gold, alig=0):
precision = correct / pred if pred else 0
recall = correct / gold if gold else 0
alignacc = correct / alig if alig else 0
fscore = 2 * correct / (pred + gold) if pred + gold else 0
return precision, recall, fscore, alignacc
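# The rest of this file is the bootstrap-resampling script working on the raw
# per-sentence counts written with print_raw=1: for every resample, sentences are drawn
# with replacement for each system, the counts are aggregated per test set,
# LAS F1 is computed per test set and macro-averaged over test sets; the resampled
# averages yield the median score, the confidence interval and the paired p-values.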
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--dir_results", "-d", default="results", help="directory with results")
parser.add_argument("--resamples", "-r", default=1000, type=int, help="how many resamples")
parser.add_argument("--confidence", "-c", default=95, help="use x-percent confidence interval")
parser.add_argument("--tests", "-t", default='all', help="comma-separated test sets")
parser.add_argument("--systems", "-s", default='all', help="comma-separated systems")
parser.add_argument("--randseed", default=0, type=int, help="random seed, default=sys time")
args = parser.parse_args()
res_dir, resamples, conf = args.dir_results, args.resamples, args.confidence
alpha = (1 - conf/100) / 2
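# Indices into the sorted resampled scores: index_lo and index_hi cut off alpha of the
# probability mass at each tail (the confidence interval), index_mid is the median
# used as the point estimate.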
index_lo = int(alpha * (resamples - 1))
index_hi = resamples - 1 - index_lo
index_mid = int(resamples / 2)
if args.systems == 'all':
systems = os.listdir(res_dir)
else:
systems = args.systems.split(',')
if args.tests == 'all':
tests = set()
for system in systems:
tests.update(os.listdir(res_dir + '/' + system))
tests = sorted(tests)
else:
tests = args.tests.split(',')
if args.randseed:
random.seed(args.randseed)
results = []
print('Loading...', file=sys.stderr)
for system in systems:
sys_results = []
results.append(sys_results)
for i_test, test in enumerate(tests):
filename = '/'.join((res_dir, system, test))
try:
with open(filename) as res_file:
sys_results.extend([[i_test] + list(map(int, l.split())) for l in res_file])
except FileNotFoundError:
logging.warning(filename + ' not found')
samples = len(sys_results)
print('Resampling...', file=sys.stderr)
boot_results = []
for i_resample in range(resamples):
print(i_resample + 1, file=sys.stderr, end='\r')
resample_results = []
boot_results.append(resample_results)
for i_system in range(len(systems)):
pred, gold, words, las = ([0] * len(tests) for _ in range(4))
for _ in range(samples):
i_test, pre, gol, wor, la_ = random.choice(results[i_system])
pred[i_test] += pre
gold[i_test] += gol
words[i_test] += wor
las[i_test] += la_
fscore_sum = 0
for i_test in range(len(tests)):
_prec, _rec, fscore, _aligacc = prec_rec_f1(las[i_test], pred[i_test], gold[i_test])
fscore_sum += fscore
resample_results.append(fscore_sum / len(tests))
print('\n', file=sys.stderr)
sys_fscores = []
for i_system, system in enumerate(systems):
sys_fscores.append([boot_results[i_resample][i_system] for i_resample in range(resamples)])
final_results = []
sys_sys_wins = [[0] * len(systems) for x in range(len(systems))]
for i_system, system in enumerate(systems):
for j_system in range(i_system):
for i, j in zip(sys_fscores[i_system], sys_fscores[j_system]):
if i > j:
sys_sys_wins[i_system][j_system] += 1
elif i < j:
sys_sys_wins[j_system][i_system] += 1
fscores = sorted(sys_fscores[i_system])
final_results.append([i_system, fscores[index_mid], fscores[index_lo], fscores[index_hi]])
sorted_systems = sorted(final_results, key=lambda x: -x[1])
for rank, sys_results in enumerate(sorted_systems):
i_system, f1_mid, f1_lo, f1_hi = sys_results
if rank < len(systems) - 1:
j_worse_sys = sorted_systems[rank + 1][0]
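# Paired bootstrap test: count the resamples in which the lower-ranked system
# beat this one; add-one smoothing keeps the p-value strictly positive.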
p_value = (sys_sys_wins[j_worse_sys][i_system] + 1) / (resamples + 1)
p_str = " p=%.3f" % p_value
else:
p_value, p_str = 1, ""
print("%2d. %17s %5.2f ±%5.2f (%5.2f .. %5.2f)%s" %
(rank + 1, systems[i_system],
100 * f1_mid, 50 * (f1_hi - f1_lo), 100 * f1_lo, 100 * f1_hi, p_str))
if p_value < (1 - conf/100):
print('-' * 60)
if __name__ == "__main__":
main()