-
Notifications
You must be signed in to change notification settings - Fork 28
/
eflomal.pyx
162 lines (140 loc) · 5.38 KB
/
eflomal.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
cimport cython
from cpython cimport bool
cimport numpy as np
from libc.stdio cimport fprintf, fdopen, fputc, fflush, fclose, FILE

import math
import os
import subprocess
import sys
from tempfile import NamedTemporaryFile

import numpy as np
cpdef tuple read_text(pyfile, bool lowercase, int prefix_len, int suffix_len):
    """Read a tokenized text file and encode it as integer sentences.

    Every line of `pyfile` is split on whitespace; each distinct (possibly
    transformed) token is assigned the next free integer, starting at 0, in
    order of first appearance.

    Arguments:
    pyfile -- iterable yielding one str per sentence (e.g. a text file)
    lowercase -- if True, each line is lowercased before splitting
    prefix_len -- if non-zero, tokens are truncated to their first
                  prefix_len characters
    suffix_len -- if non-zero AND prefix_len is zero, tokens are truncated
                  to their last suffix_len characters (the prefix setting
                  takes precedence when both are given)

    Returns:
    a tuple (list sents, dict index) where sents holds one
    np.ndarray(uint32) per input line and index maps each token string to
    the integer used for it.
    """
    cdef:
        np.ndarray[np.uint32_t, ndim=1] encoded
        list sentences, words
        str raw_line, word
        dict vocab
        int pos, length

    vocab = {}
    sentences = []
    for raw_line in pyfile:
        words = (raw_line.lower() if lowercase else raw_line).split()
        length = len(words)
        encoded = np.empty(length, dtype=np.uint32)

        for pos, word in enumerate(words):
            # Only one truncation is ever applied; prefix wins over suffix.
            if prefix_len != 0:
                word = word[:prefix_len]
            elif suffix_len != 0:
                word = word[-suffix_len:]
            # len(vocab) is evaluated before insertion, so an unseen token
            # receives the next unused id.
            encoded[pos] = vocab.setdefault(word, len(vocab))

        sentences.append(encoded)

    return (sentences, vocab)
cpdef write_text(pyfile, tuple sents, int voc_size):
    """Write a sequence of sentences in the format expected by eflomal

    The output starts with a header line "<number of sentences> <voc_size>",
    followed by one line per sentence: the sentence length and then its
    token ids, all separated by single spaces. Sentences with 0x400 (1024)
    or more tokens are written as a bare "0" line, i.e. as empty sentences.

    Arguments:
    pyfile -- Python file object to write to; must have a real file
              descriptor (fileno()) and remains open and usable afterwards
    sents -- tuple of sentences, each encoded as np.ndarray(uint32)
    voc_size -- size of vocabulary

    Raises:
    OSError -- if a C stream cannot be opened on the descriptor or if
               flushing the written data fails
    """
    cdef int i, n, n_sents, fd
    cdef FILE *f
    cdef np.ndarray[np.uint32_t, ndim=1] sent

    # Anything still buffered in the Python file object must reach the
    # descriptor before C stdio writes to it, otherwise output is reordered.
    pyfile.flush()

    # Work on a duplicate descriptor so that the C stream can be closed
    # (releasing the FILE structure) without closing the caller's file.
    fd = os.dup(pyfile.fileno())
    f = fdopen(fd, 'wb')
    if f == NULL:
        os.close(fd)
        raise OSError('fdopen() failed while preparing to write text')

    try:
        # fprintf is variadic: hand it a plain C int, not a Py_ssize_t.
        n_sents = len(sents)
        fprintf(f, '%d %d\n', n_sents, voc_size)
        for sent in sents:
            n = len(sent)
            if n < 0x400:
                fprintf(f, '%d', n)
                for i in range(n):
                    # Token ids are unsigned 32-bit values.
                    fprintf(f, ' %u', <unsigned int>sent[i])
                fputc(10, f)    # '\n'
            else:
                # Over-long sentence: emit it as an empty one.
                fputc(48, f)    # '0'
                fputc(10, f)    # '\n'
        if fflush(f) != 0:
            raise OSError('failed to flush sentence data to file')
    finally:
        # Closes only the duplicated descriptor.
        fclose(f)
def align(
        str source_filename,
        str target_filename,
        str links_filename_fwd=None,
        str links_filename_rev=None,
        str statistics_filename=None,
        str scores_filename_fwd=None,
        str scores_filename_rev=None,
        str priors_filename=None,
        int model=3,
        int score_model=0,
        tuple n_iterations=None,
        int n_samplers=1,
        bool quiet=True,
        double rel_iterations=1.0,
        double null_prior=0.2,
        bool use_gdb=False):
    """Call the eflomal binary to perform word alignment

    The binary is looked up as bin/eflomal next to this module and run as a
    subprocess with command-line options built from the arguments below.

    Arguments:
    source_filename -- str with source text filename, this and the target
                       text should both be written using write_text()
    target_filename -- str with target text filename
    links_filename_fwd -- if given, write links here (forward direction)
    links_filename_rev -- if given, write links here (reverse direction)
    statistics_filename -- if given, write alignment statistics here
    scores_filename_fwd -- if given, write sentence alignment scores here
                           (forward direction)
    scores_filename_rev -- if given, write sentence alignment scores here
                           (reverse direction)
    priors_filename -- if given, read Dirichlet priors from here
    model -- alignment model (1 = IBM1, 2 = HMM, 3 = HMM+fertility)
    score_model -- if greater than zero, passed to the binary as -M
    n_iterations -- 3-tuple with number of iterations per model, if this is
                    not given the numbers will be computed automatically
                    based on rel_iterations and the number of sentences
    n_samplers -- number of independent samplers to run
    quiet -- if True, suppress output; if False, the command line is also
             echoed to stderr
    rel_iterations -- number of iterations relative to the default
    null_prior -- passed to the binary as -N
    use_gdb -- if True, run the binary under gdb

    Raises:
    subprocess.CalledProcessError -- if the binary exits with non-zero status
    """
    # The first field of the header line is the number of sentences.
    with open(source_filename, 'rb') as f:
        n_sentences = int(f.readline().split()[0])

    if n_iterations is None:
        # Clamp to 1 so that a header reporting zero sentences cannot cause
        # a division by zero.
        iters = max(2, int(round(
            rel_iterations*5000 / math.sqrt(max(1, n_sentences)))))
        iters4 = max(1, iters//4)
        if model == 1:
            n_iterations = (iters, 0, 0)
        elif model == 2:
            n_iterations = (max(2, iters4), iters, 0)
        else:
            n_iterations = (max(2, iters4), iters4, iters)

    executable = os.path.join(os.path.dirname(__file__), 'bin', 'eflomal')
    args = [executable,
            '-m', str(model),
            '-s', source_filename,
            '-t', target_filename,
            '-n', str(n_samplers),
            '-N', str(null_prior),
            '-1', str(n_iterations[0])]
    if quiet:
        args.append('-q')
    if model >= 2:
        args.extend(['-2', str(n_iterations[1])])
    if model >= 3:
        args.extend(['-3', str(n_iterations[2])])
    if links_filename_fwd:
        args.extend(['-f', links_filename_fwd])
    if links_filename_rev:
        args.extend(['-r', links_filename_rev])
    if statistics_filename:
        args.extend(['-S', statistics_filename])
    if score_model > 0:
        args.extend(['-M', str(score_model)])
    if scores_filename_fwd:
        args.extend(['-F', scores_filename_fwd])
    if scores_filename_rev:
        args.extend(['-R', scores_filename_rev])
    if priors_filename:
        args.extend(['-p', priors_filename])

    # Echo the plain command line (without any gdb wrapper) when not quiet.
    if not quiet:
        sys.stderr.write(' '.join(args) + '\n')
    if use_gdb:
        args = ['gdb', '-ex=run', '--args'] + args
    subprocess.run(args, check=True)