-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean.py
154 lines (124 loc) · 4.55 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from tqdm import tqdm
import numpy as np
import pyemblib
import array
import sys
import os
# CLI arg 1: path to a text file listing embedding files to clean, one path per line.
target_list_path = sys.argv[1]
#========1=========2=========3=========4=========5=========6=========7==
class Embeddings(dict):
    '''Wrapper for word embeddings; inherits from dict.

    Keys are words (str); values are embedding vectors (array-like,
    all assumed to share one dimensionality).
    '''

    @property
    def size(self):
        '''Dimensionality of the stored vectors (cached after first access).

        Raises:
            ValueError: if the mapping is empty (the original code crashed
                with an opaque UnboundLocalError in that case).
        '''
        if not hasattr(self, '_size'):
            if not self:
                raise ValueError('cannot take size of empty Embeddings')
            # Any single vector determines the dimensionality.
            self._size = len(next(iter(self.values())))
        return self._size

    @property
    def dimension(self):
        '''Alias for size.'''
        return self.size

    @property
    def shape(self):
        '''(vocabulary size, embedding dimensionality).'''
        return (len(self), self.size)

    def has(self, key):
        '''True iff key maps to a non-None embedding.'''
        return self.get(key, None) is not None

    def toarray(self, ordered=False):
        '''Return (vocab tuple, 2-D NumPy array of embeddings in vocab order).

        If ordered is True, vocab is sorted lexicographically; otherwise
        insertion order is used.
        '''
        keys = sorted(self.keys()) if ordered else list(self.keys())
        vocab = tuple(keys)
        # Fix: original referenced undefined name `numpy`; the module
        # imports NumPy as `np`, so toarray always raised NameError.
        return (vocab, np.array([self[v] for v in vocab]))
#========1=========2=========3=========4=========5=========6=========7==
def _readBin(fname, size_only=False, first_n=None, separator=' ', replace_errors=False, filter_to=None, lower_keys=False, errors='strict'):
    '''Read a word2vec-style binary embeddings file.

    Vectors containing NaN or +/-Inf are dropped (counted and reported),
    which is the "cleaning" this script performs.

    Args:
        fname: path to the binary embeddings file.
        size_only: if True, return only the (numWords, dim) header.
        first_n: stop after this many kept words, if given.
        separator: nominal token separator (kept for interface
            compatibility; the reader itself splits on b' ').
        replace_errors: decode words with errors='replace' instead of
            the `errors` policy.
        filter_to: optional iterable of words to keep.
        lower_keys: match `filter_to` case-insensitively.
        errors: unicode decode-error policy for header and words.

    Returns:
        (numWords, dim) if size_only, else (words, vectors) where
        vectors is a list of 1-D np.float arrays.

    Raises:
        EOFError: if the file ends before numWords records are read
            (the original code spun forever on a truncated file, since
            read(1) returns b'' at EOF and never b' ').
    '''
    # Build the vocabulary filter once, up front.
    if filter_to:
        if lower_keys:
            filter_set = set(f.lower() for f in filter_to)
            key_filter = lambda k: k.lower() in filter_set
        else:
            filter_set = set(filter_to)
            key_filter = lambda k: k in filter_set
    else:
        key_filter = lambda k: True

    words, vectors = [], []
    problem_words = []   # words whose vectors contained NaN/Inf
    num_errors = 0

    with open(fname, 'rb') as inf:
        # Header line: "<numWords> <dim>[ <flag>]"; a third field marks
        # 8-byte floats (matches the original's heuristic).
        summary = inf.readline().decode('utf-8', errors=errors)
        summary_chunks = [int(s.strip()) for s in summary.split(' ')]
        (numWords, dim) = summary_chunks[:2]
        float_size = 8 if len(summary_chunks) > 2 else 4
        if size_only:
            return (numWords, dim)

        decode_errors = 'replace' if replace_errors else errors
        for _ in tqdm(range(numWords)):
            # Accumulate word bytes up to the b' ' separator; some files
            # also emit b'\n' between records, which is skipped.
            word_bytes = []
            while True:
                next_ch = inf.read(1)
                if not next_ch:
                    raise EOFError('unexpected end of file in %s' % fname)
                if next_ch == b' ':
                    break
                elif next_ch != b'\n':
                    word_bytes.append(next_ch)
            word = b''.join(word_bytes).decode('utf-8', errors=decode_errors)
            vector = np.array(array.array('f', inf.read(dim * float_size)))
            # np.isfinite is False for both NaN and +/-Inf, so one check
            # replaces the original isfinite+isnan membership tests.
            if not np.isfinite(vector).all():
                problem_words.append(word)
                num_errors += 1
            elif key_filter(word):
                words.append(word)
                vectors.append(vector)
                if (first_n is not None) and len(words) == first_n:
                    break

    print("Number of errors: ", num_errors)
    # verify that we read properly
    if first_n is not None:
        assert len(words) == first_n
    elif not filter_to:
        if len(words) != numWords:
            sys.stderr.write("[WARNING] Expected %d words, read %d\n" % (numWords, len(words)))
    return (words, vectors)
#========1=========2=========3=========4=========5=========6=========7==
def loopflow(target_list_path):
    '''Clean every embedding file listed in target_list_path.

    The list file contains one embedding-file path per line. Each file
    is read with _readBin (which drops NaN/Inf vectors) and the kept
    embeddings are written next to the original as
    "parse-error-fix_<basename>" in pyemblib binary format.

    Args:
        target_list_path: path to the newline-separated list of targets.
    '''
    with open(target_list_path, encoding='utf-8', errors='ignore') as f:
        target_list = f.readlines()
    for raw_line in target_list:
        # Fix: strip whitespace/newline *before* abspath. The original
        # called list.remove('\n') after abspath, which raised ValueError
        # when the final line had no trailing newline, and removed only
        # a single newline character.
        target = raw_line.strip()
        if not target:
            continue  # skip blank lines rather than abspath-ing them to CWD
        target = os.path.abspath(target)
        basename = os.path.basename(target)
        parent = os.path.dirname(target)
        words, vectors = _readBin(target)
        wordmap = Embeddings()
        # Fix: the original's inner loop reused the outer loop index `i`.
        for word, vec in zip(words, vectors):
            wordmap[word] = vec
        save_name = os.path.join(parent, 'parse-error-fix_' + basename)
        pyemblib.write(wordmap, save_name, mode=pyemblib.Mode.Binary)
# Script entry point: clean every embedding file named in the list passed as argv[1].
if __name__ == '__main__':
    loopflow(target_list_path)