/
debugme.py
50 lines (40 loc) · 1.38 KB
/
debugme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def vocab(f_path):
    """Build a token <-> index vocabulary from a whitespace-tokenised text file.

    Each distinct token receives the next free integer index, in order of
    first appearance in the file.

    Args:
        f_path: Path of the text file to read.

    Returns:
        A pair ``(t2i, i2t)``: ``t2i`` is a dict mapping each token to its
        index, and ``i2t`` is a list such that ``i2t[t2i[token]] == token``.
    """
    index_of = {}
    with open(f_path) as corpus:
        for line in corpus:
            # split() without arguments already ignores leading/trailing
            # whitespace, including the final newline.
            for token in line.split():
                # Only the first sighting of a token assigns an index.
                index_of.setdefault(token, len(index_of))
    # Dicts preserve insertion order, so the keys are already sorted by index.
    return index_of, list(index_of)
def cooc(f_path, t2i):
    """Count line-level co-occurrences between the tokens of a text file.

    For every line, each token position is paired with every token position
    of the same line (including itself), and the cell
    ``[index of first token][index of second token]`` is incremented once
    per such ordered pair.

    Args:
        f_path: Path of the whitespace-tokenised text file to read.
        t2i: Mapping from token to row/column index; its length sets the
            size of the square matrix.

    Returns:
        A ``len(t2i)`` x ``len(t2i)`` list of lists of integer counts.

    Raises:
        KeyError: If the file contains a token that is missing from ``t2i``
            (when ``t2i`` is a dict).
    """
    size = len(t2i)
    # Each row must be its own list. `[[0] * size] * size` would repeat a
    # single row object `size` times, so every increment would show up in
    # all rows at once.
    counts = [[0] * size for _ in range(size)]
    with open(f_path) as in_stream:
        for line in in_stream:
            word_indices = [t2i[w] for w in line.split()]
            for w in word_indices:
                row = counts[w]
                for other in word_indices:
                    row[other] += 1
    return counts
def arg_k_max(lst, k):
    """Return the indices of the `k` largest elements of `lst`.

    Indices are ordered by decreasing value; among equal values the lower
    index comes first. If `lst` holds fewer than `k` elements, all indices
    are returned.

    Args:
        lst: Iterable of mutually comparable values.
        k: Maximum number of indices to return.

    Returns:
        A list of at most `k` integer positions into `lst`; empty when `k`
        is zero or negative.
    """
    import heapq

    # Guard explicitly: asking for no elements is a valid request and must
    # not fail on an empty candidate list.
    if k <= 0:
        return []
    # nlargest is equivalent to a stable descending sort truncated to k, so
    # ties keep their original (ascending index) order.
    best = heapq.nlargest(k, enumerate(lst), key=lambda pair: pair[1])
    return [index for index, _ in best]
def common_neighbours(word, t2i, i2t, cooc, k=10):
    """Return the tokens whose counts are highest in `word`'s row of `cooc`.

    Args:
        word: Token whose row is inspected; looked up in `t2i`.
        t2i: Mapping from token to row index of `cooc`.
        i2t: Sequence mapping an index back to its token.
        cooc: Indexable collection of rows of counts.
        k: Number of indices requested from `arg_k_max` (default 10).

    Returns:
        A list of tokens, in the order in which `arg_k_max` returns their
        indices.
    """
    row = cooc[t2i[word]]
    return [i2t[position] for position in arg_k_max(row, k)]
# Script section: these statements run at module level, so they execute on
# import as well as on direct execution (there is no `__main__` guard).
# "ancor.txt" is a relative path, resolved against the current working
# directory.

# First pass over the file: build the token <-> index vocabulary.
ancor_t2i, ancor_i2t = vocab("ancor.txt")
# Second pass over the same file: fill the co-occurrence matrix.
ancor_cooc = cooc("ancor.txt", ancor_t2i)
# Print the tokens with the highest counts in the row of "moi" (default k=10).
print(common_neighbours("moi", ancor_t2i, ancor_i2t, ancor_cooc))