/
bleu.py
420 lines (345 loc) · 17.1 KB
/
bleu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
"""The implementation of the BLEU metric (Papineni et al., 2002)."""
import math
import logging
from importlib import import_module
from typing import List, Sequence, Optional, Dict, Any
from ..utils import my_log, sum_of_lists
from .base import Score, Signature, Metric
from .helpers import extract_all_word_ngrams
# Logger registered under the name 'sacrebleu'; used for warnings in this module
sacrelogger = logging.getLogger('sacrebleu')
# The default for the maximum n-gram order when computing precisions
MAX_NGRAM_ORDER = 4
# Maps each tokenizer name to a '<module>.<class>' path. `_get_tokenizer()`
# splits the value on its last dot and imports the module on demand from
# the `sacrebleu.tokenizers` sub-package.
_TOKENIZERS = {
    'none': 'tokenizer_none.NoneTokenizer',
    'zh': 'tokenizer_zh.TokenizerZh',
    '13a': 'tokenizer_13a.Tokenizer13a',
    'intl': 'tokenizer_intl.TokenizerV14International',
    'char': 'tokenizer_char.TokenizerChar',
    'ja-mecab': 'tokenizer_ja_mecab.TokenizerJaMecab',
    'ko-mecab': 'tokenizer_ko_mecab.TokenizerKoMecab',
    'spm': 'tokenizer_spm.TokenizerSPM',
    'flores101': 'tokenizer_spm.Flores101Tokenizer',
    'flores200': 'tokenizer_spm.Flores200Tokenizer',
}
def _get_tokenizer(name: str):
    """Look up and return the tokenizer class registered under ``name``.

    The owning module is imported lazily, on request, because importing
    every tokenizer up front is slow.

    :param name: A key of ``_TOKENIZERS``.
    :return: The tokenizer class (not an instance).
    :raises KeyError: If ``name`` is not a key of ``_TOKENIZERS``.
    """
    dotted_path = _TOKENIZERS[name]
    # '<module>.<class>' -> ('<module>', '<class>')
    module_name, class_name = dotted_path.rsplit('.', 1)
    tokenizer_module = import_module(f'.tokenizers.{module_name}', 'sacrebleu')
    return getattr(tokenizer_module, class_name)
class BLEUSignature(Signature):
    """Reproducibility signature for a BLEU metric configuration.

    :param args: key-value dictionary passed from the actual metric instance.
        The keys read here are `smooth_method`, `smooth_value` (only when the
        method takes a value), `lowercase`, `effective_order` and
        `tokenizer_signature`.
    """

    def __init__(self, args: dict):
        """`BLEUSignature` initializer."""
        super().__init__(args)

        # Short names for the BLEU-specific signature fields
        abbreviations = {
            'case': 'c',
            'eff': 'e',
            'tok': 'tok',
            'smooth': 's',
        }
        self._abbr.update(abbreviations)

        # Describe the smoothing as "<method>" or "<method>[<value>]"
        method = args['smooth_method']
        default_value = BLEU.SMOOTH_DEFAULTS[method]

        if default_value is None:
            # This method takes no parameter: the name alone is enough
            smooth_desc = method
        else:
            # The user-supplied value may be None, meaning "use the default"
            value = args['smooth_value']
            if value is None:
                value = default_value
            smooth_desc = method + f'[{value:.2f}]'

        self.info.update({
            'case': 'lc' if args['lowercase'] else 'mixed',
            'eff': 'yes' if args['effective_order'] else 'no',
            'tok': args['tokenizer_signature'],
            'smooth': smooth_desc,
        })
class BLEUScore(Score):
    """Container for a BLEU score and the statistics it was computed from.

    :param score: The BLEU score.
    :param counts: List of counts of correct ngrams, 1 <= n <= max_ngram_order
    :param totals: List of counts of total ngrams, 1 <= n <= max_ngram_order
    :param precisions: List of precisions, 1 <= n <= max_ngram_order
    :param bp: The brevity penalty.
    :param sys_len: The cumulative system length.
    :param ref_len: The cumulative reference length.
    """

    def __init__(self, score: float, counts: List[int], totals: List[int],
                 precisions: List[float], bp: float,
                 sys_len: int, ref_len: int):
        """`BLEUScore` initializer."""
        super().__init__('BLEU', score)

        # Raw statistics
        self.bp = bp
        self.counts = counts
        self.totals = totals
        self.sys_len = sys_len
        self.ref_len = ref_len
        self.precisions = precisions

        # Derived values: slash-separated precisions and the length ratio
        self.prec_str = "/".join(f"{p:.1f}" for p in self.precisions)
        if self.ref_len:
            self.ratio = self.sys_len / self.ref_len
        else:
            # A zero (or otherwise falsy) reference length yields a ratio of 0
            self.ratio = 0

        # The verbose part of BLEU
        self._verbose = "".join((
            f"{self.prec_str} (BP = {self.bp:.3f} ",
            f"ratio = {self.ratio:.3f} hyp_len = {self.sys_len:d} ",
            f"ref_len = {self.ref_len:d})",
        ))
class BLEU(Metric):
    """Computes the BLEU metric given hypotheses and references.

    :param lowercase: If True, lowercased BLEU is computed.
    :param force: Ignore data that looks already tokenized.
    :param tokenize: The tokenizer to use. If None, defaults to language-specific tokenizers with '13a' as the fallback default.
    :param smooth_method: The smoothing method to use ('floor', 'add-k', 'exp' or 'none').
    :param smooth_value: The smoothing value for `floor` and `add-k` methods. `None` falls back to default value.
    :param max_ngram_order: If given, it overrides the maximum n-gram order (default: 4) when computing precisions.
    :param effective_order: If `True`, stop including n-gram orders for which precision is 0. This should be
        `True`, if sentence-level BLEU will be computed.
    :param trg_lang: An optional language code to raise potential tokenizer warnings.
    :param references: A sequence of reference documents with document being
        defined as a sequence of reference strings. If given, the reference n-grams
        and lengths will be pre-computed and cached for faster BLEU computation
        across many systems.
    """

    SMOOTH_DEFAULTS: Dict[str, Optional[float]] = {
        # The defaults for `floor` and `add-k` are obtained from the following paper
        # A Systematic Comparison of Smoothing Techniques for Sentence-Level BLEU
        # Boxing Chen and Colin Cherry
        # http://aclweb.org/anthology/W14-3346
        'none': None,   # No value is required
        'floor': 0.1,
        'add-k': 1,
        'exp': None,    # No value is required
    }

    # Names of all selectable tokenizers
    TOKENIZERS = _TOKENIZERS.keys()

    # mteval-v13a.pl tokenizer unless Chinese, Japanese or Korean is provided
    TOKENIZER_DEFAULT = '13a'

    # Some language specific mappings to use if `trg_lang` is given
    # and the tokenizer is not explicitly specified
    _TOKENIZER_MAP = {
        'zh': 'zh',
        'ja': 'ja-mecab',
        'ko': 'ko-mecab',
    }

    _SIGNATURE_TYPE = BLEUSignature

    def __init__(self, lowercase: bool = False,
                 force: bool = False,
                 tokenize: Optional[str] = None,
                 smooth_method: str = 'exp',
                 smooth_value: Optional[float] = None,
                 max_ngram_order: int = MAX_NGRAM_ORDER,
                 effective_order: bool = False,
                 trg_lang: str = '',
                 references: Optional[Sequence[Sequence[str]]] = None):
        """`BLEU` initializer."""
        super().__init__()

        self._force = force
        self.trg_lang = trg_lang
        self.lowercase = lowercase
        self.smooth_value = smooth_value
        self.smooth_method = smooth_method
        self.max_ngram_order = max_ngram_order
        self.effective_order = effective_order

        # Sanity check (the message is an f-string so that the offending
        # value is actually shown)
        assert self.smooth_method in self.SMOOTH_DEFAULTS, \
            f"Unknown smooth_method {self.smooth_method!r}"

        # If the tokenizer wasn't specified, choose it according to the
        # following logic. We use '13a' except for ZH, JA and KO. Note that
        # this logic can only be applied when sacrebleu knows the target
        # language, which is only the case for builtin datasets.
        if tokenize is None:
            best_tokenizer = self.TOKENIZER_DEFAULT

            # Set `zh` or `ja-mecab` or `ko-mecab` if target language is provided
            if self.trg_lang in self._TOKENIZER_MAP:
                best_tokenizer = self._TOKENIZER_MAP[self.trg_lang]
        else:
            best_tokenizer = tokenize
            # An explicit choice is respected, but the user is warned when it
            # differs from the language-specific tokenizer
            if self.trg_lang == 'zh' and best_tokenizer != 'zh':
                sacrelogger.warning(
                    "Consider using the 'zh' or 'spm' tokenizer for Chinese.")
            if self.trg_lang == 'ja' and best_tokenizer != 'ja-mecab':
                sacrelogger.warning(
                    "Consider using the 'ja-mecab' or 'spm' tokenizer for Japanese.")
            if self.trg_lang == 'ko' and best_tokenizer != 'ko-mecab':
                sacrelogger.warning(
                    "Consider using the 'ko-mecab' or 'spm' tokenizer for Korean.")

        # Create the tokenizer (raises KeyError for an unregistered name)
        self.tokenizer = _get_tokenizer(best_tokenizer)()

        # Build the signature
        self.tokenizer_signature = self.tokenizer.signature()

        if references is not None:
            # Pre-compute reference ngrams and lengths
            # NOTE(review): `_cache_references` is not defined in this class;
            # presumably inherited from `Metric` - confirm.
            self._ref_cache = self._cache_references(references)

    @staticmethod
    def compute_bleu(correct: List[int],
                     total: List[int],
                     sys_len: int,
                     ref_len: int,
                     smooth_method: str = 'none',
                     smooth_value: Optional[float] = None,
                     effective_order: bool = False,
                     max_ngram_order: int = MAX_NGRAM_ORDER) -> BLEUScore:
        """Computes BLEU score from its sufficient statistics with smoothing.

        Smoothing methods (citing "A Systematic Comparison of Smoothing Techniques for Sentence-Level BLEU",
        Boxing Chen and Colin Cherry, WMT 2014: http://aclweb.org/anthology/W14-3346)

        - none: No smoothing.
        - floor: Method 1 (requires small positive value (0.1 in the paper) to be set)
        - add-k: Method 2 (Generalizing Lin and Och, 2004)
        - exp: Method 3 (NIST smoothing method i.e. in use with mteval-v13a.pl)

        The `correct` and `total` arguments are never modified: `add-k`
        smoothing is applied to copies, and those smoothed copies are what
        the returned score carries as its counts and totals.

        :param correct: List of counts of correct ngrams, 1 <= n <= max_ngram_order
        :param total: List of counts of total ngrams, 1 <= n <= max_ngram_order
        :param sys_len: The cumulative system length
        :param ref_len: The cumulative reference length
        :param smooth_method: The smoothing method to use ('floor', 'add-k', 'exp' or 'none')
        :param smooth_value: The smoothing value for `floor` and `add-k` methods. `None` falls back to default value.
        :param effective_order: If `True`, stop including n-gram orders for which precision is 0. This should be
            `True`, if sentence-level BLEU will be computed.
        :param max_ngram_order: If given, it overrides the maximum n-gram order (default: 4) when computing precisions.
        :return: A `BLEUScore` instance.
        """
        assert smooth_method in BLEU.SMOOTH_DEFAULTS, \
            f"Unknown smooth_method {smooth_method!r}"

        # Fetch the default value for floor and add-k
        if smooth_value is None:
            smooth_value = BLEU.SMOOTH_DEFAULTS[smooth_method]

        # Compute brevity penalty
        if sys_len < ref_len:
            bp = math.exp(1 - ref_len / sys_len) if sys_len > 0 else 0.0
        else:
            bp = 1.0

        # n-gram precisions
        precisions = [0.0] * max_ngram_order

        # Early stop if there are no matches (#141)
        if not any(correct):
            return BLEUScore(0.0, correct, total, precisions, bp, sys_len, ref_len)

        if smooth_method == 'add-k':
            # add-k adjusts the counts themselves; work on copies so that the
            # caller's statistics (possibly views into a shared array) are
            # left intact
            correct, total = list(correct), list(total)

        smooth_mteval = 1.
        eff_order = max_ngram_order
        for n in range(1, len(precisions) + 1):
            if smooth_method == 'add-k' and n > 1:
                correct[n - 1] += smooth_value
                total[n - 1] += smooth_value

            # No hypothesis n-grams of this order: higher orders are skipped
            if total[n - 1] == 0:
                break

            # If the system guesses no i-grams, 1 <= i <= max_ngram_order,
            # the BLEU score is 0 (technically undefined). This is a problem for sentence
            # level BLEU or a corpus of short sentences, where systems will get
            # no credit if sentence lengths fall under the max_ngram_order threshold.
            # This fix scales max_ngram_order to the observed maximum order.
            # It is only available through the API and off by default
            if effective_order:
                eff_order = n

            if correct[n - 1] == 0:
                if smooth_method == 'exp':
                    # Each successive zero-match order halves the smoothed credit
                    smooth_mteval *= 2
                    precisions[n - 1] = 100. / (smooth_mteval * total[n - 1])
                elif smooth_method == 'floor':
                    precisions[n - 1] = 100. * smooth_value / total[n - 1]
                # otherwise the precision for this order stays at 0.0
            else:
                precisions[n - 1] = 100. * correct[n - 1] / total[n - 1]

        # Compute BLEU score: brevity penalty times the exponential of the
        # mean of the per-order `my_log` values
        # NOTE(review): `my_log` comes from `..utils`; presumably it guards
        # against log(0) - confirm.
        score = bp * math.exp(
            sum(my_log(p) for p in precisions[:eff_order]) / eff_order)

        return BLEUScore(score, correct, total, precisions, bp, sys_len, ref_len)

    def _preprocess_segment(self, sent: str) -> str:
        """Given a sentence, lowercases (optionally) and tokenizes it

        :param sent: The input sentence string.
        :return: The pre-processed output string.
        """
        if self.lowercase:
            sent = sent.lower()
        # Trailing whitespace is stripped before tokenization
        return self.tokenizer(sent.rstrip())

    def _compute_score_from_stats(self, stats: List[int]) -> BLEUScore:
        """Computes the final score from already aggregated statistics.

        The layout of `stats` is the one produced by
        `_compute_segment_statistics()`:
        `[sys_len, ref_len, <correct counts...>, <total counts...>]`.

        :param stats: A list or numpy array of segment-level statistics.
        :return: A `BLEUScore` object.
        """
        return self.compute_bleu(
            correct=stats[2: 2 + self.max_ngram_order],
            total=stats[2 + self.max_ngram_order:],
            sys_len=int(stats[0]), ref_len=int(stats[1]),
            smooth_method=self.smooth_method, smooth_value=self.smooth_value,
            effective_order=self.effective_order,
            max_ngram_order=self.max_ngram_order
        )

    def _aggregate_and_compute(self, stats: List[List[int]]) -> BLEUScore:
        """Computes the final BLEU score given the pre-computed corpus statistics.

        :param stats: A list of segment-level statistics
        :return: A `BLEUScore` instance.
        """
        return self._compute_score_from_stats(sum_of_lists(stats))

    def _get_closest_ref_len(self, hyp_len: int, ref_lens: List[int]) -> int:
        """Given a hypothesis length and a list of reference lengths, returns
        the closest reference length to be used by BLEU.

        Ties are resolved in favour of the shorter reference. An empty
        `ref_lens` yields -1.

        :param hyp_len: The hypothesis length.
        :param ref_lens: A list of reference lengths.
        :return: The closest reference length.
        """
        # -1 marks "nothing seen yet"
        closest_diff, closest_len = -1, -1

        for ref_len in ref_lens:
            diff = abs(hyp_len - ref_len)
            if closest_diff == -1 or diff < closest_diff:
                closest_diff = diff
                closest_len = ref_len
            elif diff == closest_diff and ref_len < closest_len:
                closest_len = ref_len

        return closest_len

    def _extract_reference_info(self, refs: Sequence[str]) -> Dict[str, Any]:
        """Given a list of reference segments, extract the n-grams and reference lengths.
        The latter will be useful when comparing hypothesis and reference lengths for BLEU.

        When several references are given, each n-gram keeps the maximum
        count observed in any single reference.

        :param refs: A sequence of strings.
        :return: A dictionary that will be passed to `_compute_segment_statistics()`
            through keyword arguments.
        """
        ngrams = None
        ref_lens = []

        for ref in refs:
            # extract n-grams for this ref
            this_ngrams, ref_len = extract_all_word_ngrams(ref, 1, self.max_ngram_order)
            ref_lens.append(ref_len)

            if ngrams is None:
                # Set it directly for first set of refs
                ngrams = this_ngrams
            else:
                # Merge counts across multiple references
                # The below loop is faster than `ngrams |= this_ngrams`
                # NOTE(review): `ngrams[ngram]` must tolerate unseen keys,
                # i.e. this assumes a Counter-like mapping - confirm.
                for ngram, count in this_ngrams.items():
                    ngrams[ngram] = max(ngrams[ngram], count)

        return {'ref_ngrams': ngrams, 'ref_lens': ref_lens}

    def _compute_segment_statistics(self, hypothesis: str,
                                    ref_kwargs: Dict) -> List[int]:
        """Given a (pre-processed) hypothesis sentence and already computed
        reference n-grams & lengths, returns the best match statistics across the
        references.

        :param hypothesis: Hypothesis sentence.
        :param ref_kwargs: A dictionary with `ref_ngrams` and `ref_lens` keys
            that denote the counter containing all n-gram counts and reference lengths,
            respectively.
        :return: A list of integers with match statistics, laid out as
            `[hyp_len, ref_len, <correct counts...>, <total counts...>]`.
        """
        ref_ngrams, ref_lens = ref_kwargs['ref_ngrams'], ref_kwargs['ref_lens']

        # Extract n-grams for the hypothesis
        hyp_ngrams, hyp_len = extract_all_word_ngrams(
            hypothesis, 1, self.max_ngram_order)

        ref_len = self._get_closest_ref_len(hyp_len, ref_lens)

        # Count the stats
        # Although counter has its internal & and | operators, this is faster
        correct = [0] * self.max_ngram_order
        total = correct[:]
        for hyp_ngram, hyp_count in hyp_ngrams.items():
            # n-gram order (0-based index: the length of the n-gram key minus one)
            n = len(hyp_ngram) - 1
            # count hypothesis n-grams
            total[n] += hyp_count
            # count matched n-grams, clipped to the reference count
            if hyp_ngram in ref_ngrams:
                correct[n] += min(hyp_count, ref_ngrams[hyp_ngram])

        # Return a flattened list for efficient computation
        return [hyp_len, ref_len] + correct + total

    def sentence_score(self, hypothesis: str, references: Sequence[str]) -> BLEUScore:
        """Compute the metric for a single sentence against a single (or multiple) reference(s).

        A warning is logged when `effective_order` is disabled; the score
        itself is computed by the parent class.

        :param hypothesis: A single hypothesis string.
        :param references: A sequence of reference strings.
        :return: a `BLEUScore` object.
        """
        if not self.effective_order:
            sacrelogger.warning(
                'It is recommended to enable `effective_order` for sentence-level BLEU.')
        return super().sentence_score(hypothesis, references)