-
Notifications
You must be signed in to change notification settings - Fork 0
/
translate.py
executable file
·595 lines (505 loc) · 22.4 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Translates a source file using a translation model.
'''
import sys
import numpy
import json
import os
import logging
from multiprocessing import Process, Queue
from collections import defaultdict
from Queue import Empty
from util import load_dict, load_config, seqs2words
from compat import fill_options
from hypgraph import HypGraphRenderer
from settings import TranslationSettings
class Translation(object):
    """
    A single translated segment together with the optional by-products
    of decoding (score, alignment matrix, word probabilities, search
    graph).
    """
    #TODO move to separate file?

    def __init__(self, source_words, target_words, sentence_id=None, score=0, alignment=None,
                 target_probs=None, hyp_graph=None, hypothesis_id=None):
        self.source_words = source_words
        self.target_words = target_words
        self.sentence_id = sentence_id
        self.score = score
        self.alignment = alignment  #TODO: assertion of length?
        self.target_probs = target_probs  #TODO: assertion of length?
        self.hyp_graph = hyp_graph
        self.hypothesis_id = hypothesis_id

    def get_alignment(self):
        """
        Returns the alignment exactly as it was passed to the constructor.
        """
        return self.alignment

    def get_alignment_text(self):
        """
        Renders the alignment as plain text: one header line followed by
        one line per row of the alignment, weights separated by spaces.

        Header columns: sentence id ||| target words ||| score |||
                        source words ||| len(source words) + 1
                        len(target words) + 1
        """
        # NOTE(review): the "+ 1" presumably accounts for the "</s>"
        # token that get_alignment_json() appends - confirm.
        header = "{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n".format(
            self.sentence_id,
            " ".join(self.target_words),
            self.score,
            " ".join(self.source_words),
            len(self.source_words) + 1,
            len(self.target_words) + 1
        )
        rows = [" ".join([str(weight) for weight in row])
                for row in self.alignment]
        return header + "\n".join(rows)

    def get_alignment_json(self, as_string=True):
        """
        Returns the alignment as a list of
        (target token, source token, weight, sentence id, tid) tuples
        (@param as_string False) or as that list dumped to a JSON
        formatted string (@param as_string True).
        """
        source_tokens = self.source_words + ["</s>"]
        target_tokens = self.target_words + ["</s>"]

        # NOTE(review): for n-best output the two ids are *added*, so
        # with integer ids different (sentence, hypothesis) pairs can
        # share a tid - confirm this is what consumers expect.
        tid = self.sentence_id
        if self.hypothesis_id is not None:
            tid = self.sentence_id + self.hypothesis_id

        links = [
            (target_tokens[target_index],
             source_tokens[source_index],
             str(weight),
             self.sentence_id,
             tid)
            for target_index, row in enumerate(self.alignment)
            for source_index, weight in enumerate(row)
        ]

        if as_string:
            return json.dumps(links, ensure_ascii=False, indent=2)
        return links

    def get_target_probs(self):
        """
        Returns the word probabilities joined by single spaces.
        """
        formatted = ["{0}".format(prob) for prob in self.target_probs]
        return " ".join(formatted)

    def save_hyp_graph(self, filename, word_idict_trg, detailed=True, highlight_best=True):
        """
        Renders the search graph to @param filename. Does nothing when
        no search graph is attached to this translation.
        """
        if not self.hyp_graph:
            #TODO: Warning if no search graph has been constructed during decoding?
            return
        renderer = HypGraphRenderer(self.hyp_graph)
        renderer.wordify(word_idict_trg)
        renderer.save(filename, detailed, highlight_best)
class QueueItem(object):
    """
    Plain attribute bag for items put on a queue: every keyword
    argument given to the constructor becomes an instance attribute of
    the same name.
    """

    def __init__(self, **kwargs):
        for name, value in kwargs.items():
            setattr(self, name, value)
class Translator(object):
    """
    Translates segments with one or more models.

    The constructor starts `settings.num_processes` worker processes.
    Each worker loads the models itself and then serves jobs from a
    shared input queue, putting its results on a shared output queue.
    """

    def __init__(self, settings):
        """
        Loads translation models.

        Reads `models`, `num_processes`, `device_list` and `verbose`
        from @param settings, then loads the model options and
        dictionaries, creates the queues and starts the workers.
        """
        self._models = settings.models
        self._num_processes = settings.num_processes
        self._device_list = settings.device_list
        self._verbose = settings.verbose
        # request id -> {segment index -> output item}
        self._retrieved_translations = defaultdict(dict)
        # load model options
        self._load_model_options()
        # load and invert dictionaries
        self._build_dictionaries()
        # set up queues
        self._init_queues()
        # init worker processes
        self._init_processes()

    def _load_model_options(self):
        """
        Loads config options for each model.
        """
        options = []
        for model in self._models:
            m = load_config(model)
            if 'concatenate_lm_decoder' not in m:
                m['concatenate_lm_decoder'] = False
            options.append(m)
            # backward compatibility
            fill_options(options[-1])
        self._options = options

    @staticmethod
    def _invert_dictionary(word_dict):
        """
        Returns the index -> word mapping for @param word_dict, with
        index 0 forced to '<eos>' and index 1 forced to 'UNK'.
        """
        word_idict = dict((idx, word) for word, idx in word_dict.items())
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        return word_idict

    def _build_dictionaries(self):
        """
        Builds and inverts source and target dictionaries, taken
        from the first model since all of them must have the same
        vocabulary.

        The last entry of the model's 'dictionaries' option is the
        target dictionary, all entries before it are source
        dictionaries.
        """
        dictionaries = self._options[0]['dictionaries']
        dictionaries_source = dictionaries[:-1]
        dictionary_target = dictionaries[-1]
        n_words_src = self._options[0]['n_words_src']

        # load and invert source dictionaries
        word_dicts = []
        word_idicts = []
        for dictionary in dictionaries_source:
            word_dict = load_dict(dictionary)
            if n_words_src:
                # Drop every entry whose index is >= n_words_src.
                # Iterate over a snapshot: deleting while iterating a
                # live items() view fails on Python 3.
                for key, idx in list(word_dict.items()):
                    if idx >= n_words_src:
                        del word_dict[key]
            word_dicts.append(word_dict)
            word_idicts.append(self._invert_dictionary(word_dict))

        self._word_dicts = word_dicts
        self._word_idicts = word_idicts

        # load and invert target dictionary
        word_dict_trg = load_dict(dictionary_target)
        self._word_idict_trg = self._invert_dictionary(word_dict_trg)

    def _init_queues(self):
        """
        Sets up shared queues for inter-process communication.
        """
        self._input_queue = Queue()
        self._output_queue = Queue()

    def shutdown(self):
        """
        Executed from parent process to terminate workers,
        method: "poison pill".

        One `None` is put on the input queue per worker; a worker
        leaves its loop when it reads `None`.
        """
        for _ in self._processes:
            self._input_queue.put(None)

    def _init_processes(self):
        """
        Starts child (worker) processes.

        Devices from the device list (if any) are assigned to the
        workers round-robin; without a device list every worker gets
        the empty device id.
        """
        processes = [None] * self._num_processes
        for process_id in range(self._num_processes):
            deviceid = ''
            if self._device_list is not None and len(self._device_list) != 0:
                deviceid = self._device_list[process_id % len(self._device_list)].strip()
            processes[process_id] = Process(
                target=self._start_worker,
                args=(process_id, deviceid)
            )
            processes[process_id].start()

        self._processes = processes

    ### MODEL LOADING AND TRANSLATION IN CHILD PROCESS ###

    def _load_theano(self):
        """
        Loads models, sets theano shared variables and builds samplers.
        This entails irrevocable binding to a specific GPU.

        Returns (trng, fs_init, fs_next, gen_sample); fs_init and
        fs_next hold one function per model.
        """
        # theano is imported here, not at module level, so that the
        # import happens inside the worker after _set_device() ran.
        from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
        from theano import shared

        from nmt import (build_sampler, gen_sample)
        from theano_util import (numpy_floatX, load_params, init_theano_params)

        trng = RandomStreams(1234)
        use_noise = shared(numpy_floatX(0.))

        fs_init = []
        fs_next = []

        for model, option in zip(self._models, self._options):
            # skip the parameters whose names start with 'adam_'
            param_list = numpy.load(model).files
            param_list = dict.fromkeys(
                [key for key in param_list if not key.startswith('adam_')], 0)
            params = load_params(model, param_list)
            tparams = init_theano_params(params)

            # always return alignment at this point
            f_init, f_next = build_sampler(
                tparams, option, use_noise, trng, return_alignment=True)

            fs_init.append(f_init)
            fs_next.append(f_next)

        return trng, fs_init, fs_next, gen_sample

    def _set_device(self, device_id):
        """
        Modifies environment variable to change the THEANO device.

        The first entry of THEANO_FLAGS that starts with 'device' is
        replaced by 'device=<device_id>'; if there is none, such an
        entry is appended. An empty @param device_id leaves the
        environment untouched.
        """
        if device_id == '':
            return

        device_flag = '%s=%s' % ('device', device_id)
        current = os.environ.get('THEANO_FLAGS')
        if current is None:
            # environment variable does not exist at all
            os.environ['THEANO_FLAGS'] = device_flag
            return

        theano_flags = current.split(',')
        for i, flag in enumerate(theano_flags):
            if flag.strip().startswith('device'):
                theano_flags[i] = device_flag
                break
        else:
            # no 'device' entry was found
            theano_flags.append(device_flag)
        os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    def _load_models(self, process_id, device_id):
        """
        Modifies environment variable to change the THEANO device, then loads
        models and returns them.
        """
        logging.debug("Process '%s' - Loading models on device %s\n" % (process_id, device_id))

        # modify environment flag 'device'
        self._set_device(device_id)

        # build and return models
        return self._load_theano()

    def _start_worker(self, process_id, device_id):
        """
        Function executed by each worker once started. Do not execute in
        the parent process.

        Loads the models, then translates input items until a `None`
        item is read. Every result is put on the output queue as
        (request_id, idx, output_item).
        """
        # load theano functionality
        trng, fs_init, fs_next, gen_sample = self._load_models(process_id, device_id)

        # listen to queue in while loop, translate items
        while True:
            input_item = self._input_queue.get()

            if input_item is None:
                break
            idx = input_item.idx
            request_id = input_item.request_id

            output_item = self._translate(process_id, input_item, trng, fs_init, fs_next, gen_sample)
            self._output_queue.put((request_id, idx, output_item))

        return

    def _translate(self, process_id, input_item, trng, fs_init, fs_next, gen_sample):
        """
        Actual translation (model sampling).

        Returns (sample, score, word_probs, alignment, hyp_graph). With
        `input_item.nbest` set to True the first four hold all
        hypotheses, otherwise only the entry with the lowest score.
        """
        # unpack input item attributes
        normalization_alpha = input_item.normalization_alpha
        nbest = input_item.nbest
        idx = input_item.idx

        # logging
        logging.debug('{0} - {1}\n'.format(process_id, idx))

        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment, hyp_graph = self._sample(input_item, trng, fs_init, fs_next, gen_sample)

        # normalize scores according to sequence lengths
        if normalization_alpha:
            adjusted_lengths = numpy.array([len(s) ** normalization_alpha for s in sample])
            score = score / adjusted_lengths

        if nbest is True:
            return sample, score, word_probs, alignment, hyp_graph

        # return translation with lowest score only
        sidx = numpy.argmin(score)
        return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx], hyp_graph

    def _sample(self, input_item, trng, fs_init, fs_next, gen_sample):
        """
        Sample from model.

        The maximum output length is 200, or
        int(max_ratio * source length) when `input_item.max_ratio` is
        set.
        """
        # unpack input item attributes
        return_hyp_graph = input_item.return_hyp_graph
        return_alignment = input_item.return_alignment
        suppress_unk = input_item.suppress_unk
        k = input_item.k
        seq = input_item.seq
        max_ratio = input_item.max_ratio

        maxlen = 200  #TODO: should be configurable
        if max_ratio:
            maxlen = int(max_ratio * len(seq))

        # seq is a list of per-word factor lists; reshape it to
        # (factors, words, 1) for the sampler
        return gen_sample(fs_init, fs_next,
                          numpy.array(seq).T.reshape(
                              [len(seq[0]), len(seq), 1]),
                          self._options[0],
                          trng=trng, k=k, maxlen=maxlen,
                          stochastic=False, argmax=False,
                          return_alignment=return_alignment,
                          suppress_unk=suppress_unk,
                          return_hyp_graph=return_hyp_graph)

    ### WRITING TO AND READING FROM QUEUES ###

    def _send_jobs(self, input_, translation_settings):
        """
        Converts every line of @param input_ to word ids and puts one
        job per line on the input queue.

        Words (or factors) missing from the source dictionaries are
        mapped to id 1; a row of zeros is appended to every segment.
        If a word does not have the number of factors the model
        expects, all workers are terminated and the program exits.

        Returns (number of jobs sent, list of tokenized source
        segments); for empty input this is (0, []).
        """
        n_factors = None
        source_sentences = []
        for idx, line in enumerate(input_):
            if n_factors is None:
                n_factors = self._options[0]['factors']

            if translation_settings.char_level:
                # only decode byte strings; text that is already
                # unicode would fail in decode() for non-ASCII input
                if isinstance(line, bytes):
                    line = line.decode('utf-8')
                words = list(line.strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                if n_factors > 1:
                    w = [self._word_dicts[i][f] if f in self._word_dicts[i] else 1 for (i, f) in enumerate(w.split('|'))]
                else:
                    w = [self._word_dicts[0][w] if w in self._word_dicts[0] else 1]
                if len(w) != n_factors:
                    logging.warning('Expected {0} factors, but input word has {1}\n'.format(n_factors, len(w)))
                    for process in self._processes:
                        process.terminate()
                    sys.exit(1)
                x.append(w)

            x += [[0] * n_factors]

            input_item = QueueItem(verbose=self._verbose,
                                   return_hyp_graph=translation_settings.get_search_graph,
                                   return_alignment=translation_settings.get_alignment,
                                   k=translation_settings.beam_width,
                                   suppress_unk=translation_settings.suppress_unk,
                                   normalization_alpha=translation_settings.normalization_alpha,
                                   nbest=translation_settings.n_best,
                                   max_ratio=translation_settings.max_ratio,
                                   seq=x,
                                   idx=idx,
                                   request_id=translation_settings.request_id)

            self._input_queue.put(input_item)
            source_sentences.append(words)

        # Count the collected segments rather than using the loop
        # variable, which is unbound when the input is empty.
        return len(source_sentences), source_sentences

    def _retrieve_jobs(self, num_samples, request_id, timeout=5):
        """
        Generator yielding the output items of request
        @param request_id in segment order (index 0 to
        @param num_samples - 1).

        Blocks until all @param num_samples results of the request
        have been read from the output queue. Results belonging to
        other requests are stored under their own request id and left
        in place. Whenever the queue stays empty for @param timeout
        seconds the workers are checked; if one of them has died with
        a non-zero exit code, all workers are terminated and the
        program exits. After the last item has been yielded the
        request's entries are removed.
        """
        while len(self._retrieved_translations[request_id]) < num_samples:
            resp = None
            while resp is None:
                try:
                    resp = self._output_queue.get(True, timeout)
                # if queue is empty after the timeout, check if processes are still alive
                except Empty:
                    for process in self._processes:
                        if not process.is_alive() and process.exitcode != 0:
                            # kill all other processes and raise exception if one dies
                            self._input_queue.cancel_join_thread()
                            self._output_queue.cancel_join_thread()
                            for other in self._processes:
                                other.terminate()
                            logging.error("Translate worker process {0} crashed with exitcode {1}".format(process.pid, process.exitcode))
                            sys.exit(1)
            # Use separate names for the response fields: the response
            # may belong to another request and must not overwrite the
            # request id this call is waiting for.
            resp_request_id, resp_idx, output_item = resp
            self._retrieved_translations[resp_request_id][resp_idx] = output_item

        for sample_idx in range(num_samples):
            yield self._retrieved_translations[request_id][sample_idx]

        # then remove all entries with this request ID from the dictionary
        del self._retrieved_translations[request_id]

    ### EXPOSED TRANSLATION FUNCTIONS ###

    def translate(self, source_segments, translation_settings):
        """
        Returns the translation of @param source_segments.

        The result has one entry per segment: a Translation object,
        or - if `translation_settings.n_best` is True - a list of
        Translation objects ordered by ascending score.
        """
        logging.info('Translating {0} segments...\n'.format(len(source_segments)))
        n_samples, source_sentences = self._send_jobs(source_segments, translation_settings)

        translations = []
        for i, trans in enumerate(self._retrieve_jobs(n_samples, translation_settings.request_id)):

            samples, scores, word_probs, alignment, hyp_graph = trans
            # n-best list
            if translation_settings.n_best is True:
                order = numpy.argsort(scores)
                n_best_list = []
                for j in order:
                    current_alignment = None if not translation_settings.get_alignment else alignment[j]
                    translation = Translation(sentence_id=i,
                                              source_words=source_sentences[i],
                                              target_words=seqs2words(samples[j], self._word_idict_trg, join=False),
                                              score=scores[j],
                                              alignment=current_alignment,
                                              target_probs=word_probs[j],
                                              hyp_graph=hyp_graph,
                                              hypothesis_id=j)
                    n_best_list.append(translation)
                translations.append(n_best_list)
            # single-best translation
            else:
                current_alignment = None if not translation_settings.get_alignment else alignment
                translation = Translation(sentence_id=i,
                                          source_words=source_sentences[i],
                                          target_words=seqs2words(samples, self._word_idict_trg, join=False),
                                          score=scores,
                                          alignment=current_alignment,
                                          target_probs=word_probs,
                                          hyp_graph=hyp_graph)
                translations.append(translation)
        return translations

    def translate_file(self, input_object, translation_settings):
        """
        Translates all lines read from the file-like
        @param input_object.
        """
        source_segments = input_object.readlines()
        return self.translate(source_segments, translation_settings)

    def translate_string(self, segment, translation_settings):
        """
        Translates a single segment
        """
        if not segment.endswith('\n'):
            segment += '\n'
        source_segments = [segment]
        return self.translate(source_segments, translation_settings)

    def translate_list(self, segments, translation_settings):
        """
        Translates a list of segments
        """
        source_segments = [s + '\n' if not s.endswith('\n') else s for s in segments]
        return self.translate(source_segments, translation_settings)

    ### FUNCTIONS FOR WRITING THE RESULTS ###

    def write_alignment(self, translation, translation_settings):
        """
        Writes alignments to a file.

        The file is `translation_settings.output_alignment`; the format
        is JSON if `translation_settings.json_alignment` is set, plain
        text otherwise.
        """
        output_file = translation_settings.output_alignment
        if translation_settings.json_alignment:
            output_file.write(translation.get_alignment_json() + "\n")
        else:
            output_file.write(translation.get_alignment_text() + "\n\n")

    def write_translation(self, output_file, translation, translation_settings):
        """
        Writes a single translation to a file or STDOUT.

        In n-best mode the sentence id, target words, score and
        (optionally) word probabilities go on one line separated by
        ' ||| '; otherwise the target words and (optionally) the word
        probabilities go on separate lines. Depending on the settings
        the alignment and the search graph are written as well.
        """
        output_items = []
        # sentence ID only for nbest
        if translation_settings.n_best is True:
            output_items.append(str(translation.sentence_id))

        # translations themselves
        output_items.append(" ".join(translation.target_words))

        # write scores for nbest?
        if translation_settings.n_best is True:
            output_items.append(str(translation.score))

        # write probabilities?
        if translation_settings.get_word_probs:
            output_items.append(translation.get_target_probs())

        if translation_settings.n_best is True:
            output_file.write(" ||| ".join(output_items) + "\n")
        else:
            output_file.write("\n".join(output_items) + "\n")

        # write alignments to file?
        if translation_settings.get_alignment:
            self.write_alignment(translation, translation_settings)

        # construct hypgraph?
        if translation_settings.get_search_graph:
            translation.save_hyp_graph(
                translation_settings.search_graph_filename,
                self._word_idict_trg,
                detailed=True,
                highlight_best=True
            )

    def write_translations(self, output_file, translations, translation_settings):
        """
        Writes translations to a file or STDOUT.

        In n-best mode @param translations is expected to be a list of
        n-best lists, otherwise a flat list of translations.
        """
        if translation_settings.n_best is True:
            for nbest_list in translations:
                for translation in nbest_list:
                    self.write_translation(output_file, translation, translation_settings)
        else:
            for translation in translations:
                self.write_translation(output_file, translation, translation_settings)
def main(input_file, output_file, translation_settings):
    """
    Translates a source language file (or STDIN) into a target language file
    (or STDOUT).

    @param input_file file-like object the source segments are read from
    @param output_file file-like object the translations are written to
    @param translation_settings settings used both to set up the
           translator and to control translation and output
    """
    translator = Translator(translation_settings)
    try:
        translations = translator.translate_file(input_file, translation_settings)
        translator.write_translations(output_file, translations, translation_settings)
        logging.info('Done')
    finally:
        # Shut the workers down even when translating or writing
        # fails; without the poison pills they would keep waiting on
        # the input queue.
        translator.shutdown()
if __name__ == "__main__":
    # settings (including input and output) come from the command line
    translation_settings = TranslationSettings(from_console_arguments=True)
    input_file = translation_settings.input
    output_file = translation_settings.output
    # verbose runs log at DEBUG level, all others at WARNING level
    if translation_settings.verbose:
        level = logging.DEBUG
    else:
        level = logging.WARNING
    logging.basicConfig(level=level, format='%(levelname)s: %(message)s')
    main(input_file, output_file, translation_settings)