# preprocess_multirc.py
import tensorflow as tf
import numpy as np
import pickle
import json
import re
import time
from config import *  # provides FLAGS (paths, size limits, batch settings)
from sklearn.utils import shuffle

class Vocab:
    def __init__(self, token2idx=None, idx2token=None):
        self.token2idx = token2idx or dict()
        self.idx2token = idx2token or dict()

    def new_token(self, token):
        # token : word or character
        # return : index of token (added to the vocabulary if unseen)
        if token not in self.token2idx:
            index = len(self.token2idx)
            self.token2idx[token] = index
            self.idx2token[index] = token
        return self.token2idx[token]

    def get_token(self, index):
        # index : position number of token
        # return : word or character at index
        return self.idx2token[index]

    def get_index(self, token):
        # token : word or character
        # return : index of token
        return self.token2idx[token]
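
# Illustrative Vocab usage (a minimal sketch, not part of the pipeline):
#   vocab = Vocab()
#   vocab.new_token('UNK')      # -> 0 (first token gets index 0)
#   vocab.new_token('hello')    # -> 1
#   vocab.new_token('hello')    # -> 1 (already known, index is reused)
#   vocab.get_token(1)          # -> 'hello'
#   vocab.get_index('UNK')      # -> 0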

def clean_str(text):
    # Strip simple HTML tags and put spaces around punctuation so that
    # tokens line up with entries in the GloVe vocabulary.
    text = re.sub('<br>', '', text)
    text = re.sub('</b>', '', text)
    text = re.sub('<b>', ' ', text)
    text = re.sub(':', ' :', text)
    text = re.sub(',', ' ,', text)
    text = re.sub('"', ' " ', text)
    text = re.sub(r'\.', ' .', text)
    text = re.sub(r'\?', ' ?', text)
    text = re.sub(r'\-', ' - ', text)
    text = re.sub(r'\(', '( ', text)
    text = re.sub(r'\)', ' )', text)
    text = re.sub(r'\;', ' ;', text)
    text = re.sub('n\'t', ' not ', text)  # heuristic for the GloVe word table
    text = re.sub(r'\[', '[ ', text)
    text = re.sub(r'\]', ' ]', text)
    text = re.sub(r'\!', ' !', text)
    return text
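
# Example (sketch): punctuation is split off into separate tokens,
# e.g. clean_str('Hello, world.') -> 'Hello , world .'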

# load pretrained GloVe file
def make_glove():
    word_vocab = dict()
    start = time.time()
    with open(FLAGS.glove_dir + FLAGS.glove_file, 'r', encoding='utf-8') as reader:
        print('Start making GloVe word dictionary')
        for idx, line in enumerate(reader):
            one_line = line.split()
            try:
                word = one_line[0]
                embedding = np.array([float(val) for val in one_line[1:]])
                word_vocab[word] = embedding
            except ValueError:
                # A few entries in the GloVe file have tokens that themselves
                # contain spaces (e.g. '. . .'), so the float conversion above
                # fails. The indices below are hardcoded for the specific
                # GloVe file this script was written against.
                if idx == 52343:
                    word = '. . .'
                    embedding = np.array([float(val) for val in one_line[3:]])
                    word_vocab[word] = embedding
                elif idx == 151102:
                    word = '. . . . .'
                    embedding = np.array([float(val) for val in one_line[5:]])
                    word_vocab[word] = embedding
                elif idx == 209833:
                    word = '. .'
                    embedding = np.array([float(val) for val in one_line[2:]])
                    word_vocab[word] = embedding
                elif idx == 220779:
                    word = '. . . .'
                    embedding = np.array([float(val) for val in one_line[4:]])
                    word_vocab[word] = embedding
            if idx in FLAGS.except2_idx:
                # entries whose token is two space-separated pieces
                word = one_line[0] + ' ' + one_line[1]
                embedding = np.array([float(val) for val in one_line[2:]])
                word_vocab[word] = embedding
            if (idx + 1) % 1000000 == 0:
                print('elapsed time for 1000000 words:', time.time() - start)
    print('Done loading GloVe word dictionary')
    return word_vocab

def pickle_dump(glove_vocab=None):
    # With no argument: load the cached GloVe dictionary from the pickle file.
    # With an argument: write the given dictionary to the pickle file.
    if glove_vocab is None:
        print('Loading GloVe pickle to dictionary')
        with open(FLAGS.glove_dir + FLAGS.glove_load, 'rb') as f:
            glove_vocab = pickle.load(f)
        return glove_vocab
    else:
        print('Start writing dictionary to pickle')
        with open(FLAGS.glove_dir + FLAGS.glove_load, 'wb') as f:
            pickle.dump(glove_vocab, f, protocol=pickle.HIGHEST_PROTOCOL)
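
# One-time setup sketch (assumes FLAGS.glove_dir/FLAGS.glove_file point at a
# raw GloVe text file and FLAGS.glove_load at the pickle cache):
#   glove_vocab = make_glove()     # parse the raw GloVe file (slow)
#   pickle_dump(glove_vocab)       # cache it as a pickle
# Subsequent runs only need:
#   glove_vocab = pickle_dump()    # load the cached pickle (fast)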

def multirc_data_load():
    with open('./dataset/splitv2/train_456-fixedIds.json', 'r', encoding='utf-8') as reader:
        train_data = json.load(reader)
    with open('./dataset/splitv2/dev_83-fixedIds.json', 'r', encoding='utf-8') as reader:
        dev_data = json.load(reader)
    return train_data, dev_data

def make_data(train_data):
    whole_word = dict()
    whole_char = dict()
    query_word = dict()
    query_char = dict()
    label_dict = dict()
    label_word = dict()
    word_maxlen = 0
    ans_maxnum = 0
    char_vocab = Vocab()
    word_vocab = Vocab()
    word_vocab.new_token('UNK')
    for data_idx in range(len(train_data['data'])):
        whole_word[data_idx] = list()
        whole_char[data_idx] = list()
        query_word[data_idx] = dict()
        query_char[data_idx] = dict()
        label_dict[data_idx] = dict()
        label_word[data_idx] = dict()
        # one paragraph has many questions, and each question has many answers
        text = clean_str(train_data['data'][data_idx]['paragraph']['text'])
        # split on the 'Sent N :' markers and drop the sentence-number prefix below
        text = text.split('Sent')[1:]
        questions = train_data['data'][data_idx]['paragraph']['questions']
        # paragraph text : build word and character vocabulary dictionaries
        for sent_idx in range(len(text)):
            new_sent = text[sent_idx][5:].split()
            for word in new_sent:
                whole_word[data_idx].append(word_vocab.new_token(word))
                whole_char[data_idx].append([char_vocab.new_token(c) for c in word])
                if len(word) > word_maxlen:
                    word_maxlen = len(word)
        # questions : build question word and character vocabulary dictionaries
        for ques_idx in range(len(questions)):
            new_ques = clean_str(questions[ques_idx]['question'])
            label_dict[data_idx][ques_idx] = dict()
            label_word[data_idx][ques_idx] = dict()
            query_word[data_idx][ques_idx] = list()
            query_char[data_idx][ques_idx] = list()
            for word in new_ques.split():
                query_word[data_idx][ques_idx].append(word_vocab.new_token(word))
                query_char[data_idx][ques_idx].append([char_vocab.new_token(c) for c in word])
                if len(word) > word_maxlen:
                    word_maxlen = len(word)
            # answer text and answer correctness
            new_ans = questions[ques_idx]['answers']
            for ans_idx in range(len(new_ans)):
                label_word[data_idx][ques_idx][ans_idx] = list()
                label_dict[data_idx][ques_idx][ans_idx] = np.zeros((2), dtype=np.int32)
                ans_text = clean_str(new_ans[ans_idx]['text'])
                # one-hot correctness label: [1, 0] = true answer, [0, 1] = false answer
                if new_ans[ans_idx]['isAnswer']:
                    label_dict[data_idx][ques_idx][ans_idx] = [1, 0]
                else:
                    label_dict[data_idx][ques_idx][ans_idx] = [0, 1]
                # answer words only; add character handling here if needed
                for word in ans_text.split():
                    label_word[data_idx][ques_idx][ans_idx].append(word_vocab.new_token(word))
            if ans_maxnum < len(new_ans):
                ans_maxnum = len(new_ans)
    return word_vocab, char_vocab, whole_word, whole_char, query_word, query_char, label_dict, label_word, word_maxlen, ans_maxnum
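
# Structure of the returned containers (sketch): whole_word[d] is the flat
# list of word ids for paragraph d; query_word[d][q] is the word-id list for
# question q of paragraph d; label_word[d][q][a] is the word-id list for
# answer a; label_dict[d][q][a] is the one-hot label described above.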

def glove_idx2vec(word_vocab, glove_vocab):
    # Map every vocabulary index to its 300-d GloVe vector; out-of-vocabulary
    # words fall back to the pretrained 'unk' vector.
    word_idx2vec = np.zeros((len(word_vocab.token2idx), 300), dtype=np.float32)
    for word, idx in word_vocab.token2idx.items():
        try:
            word_idx2vec[idx, :] = glove_vocab[word]
        except KeyError:
            word_idx2vec[idx, :] = glove_vocab['unk']
    return word_idx2vec

def embedding_matrix(whole_word, whole_char, query_word, query_char, label_word):
    # 456 is the number of paragraphs in the MultiRC training split
    # (train_456-fixedIds.json); all arrays are zero-padded to the FLAGS maxima.
    text_word = np.zeros((456, FLAGS.max_text), dtype=np.int32)
    text_char = np.zeros((456, FLAGS.max_text, FLAGS.max_char), dtype=np.int32)
    question_word = np.zeros((456, FLAGS.max_num_query, FLAGS.max_query_text), dtype=np.int32)
    question_char = np.zeros((456, FLAGS.max_num_query, FLAGS.max_query_text, FLAGS.max_query_char), dtype=np.int32)
    answer_word = np.zeros((456, FLAGS.max_num_query, FLAGS.max_num_answer, FLAGS.max_answer), dtype=np.int32)
    for data_idx in range(len(whole_word)):
        # paragraph text word array
        text_word[data_idx, :len(whole_word[data_idx])] = whole_word[data_idx]
        # paragraph text character array
        for word_idx in range(len(whole_char[data_idx])):
            text_char[data_idx, word_idx, :len(whole_char[data_idx][word_idx])] = whole_char[data_idx][word_idx]
        # question text word array
        for ques_idx in range(len(query_word[data_idx])):
            question_word[data_idx, ques_idx, :len(query_word[data_idx][ques_idx])] = query_word[data_idx][ques_idx]
        # question text character array
        for ques_idx in range(len(query_char[data_idx])):
            for word_idx in range(len(query_char[data_idx][ques_idx])):
                question_char[data_idx, ques_idx, word_idx, :len(query_char[data_idx][ques_idx][word_idx])] = query_char[data_idx][ques_idx][word_idx]
        # answer text word array
        for ques_idx in range(len(label_word[data_idx])):
            for ans_idx in range(len(label_word[data_idx][ques_idx])):
                answer_word[data_idx, ques_idx, ans_idx, :len(label_word[data_idx][ques_idx][ans_idx])] = label_word[data_idx][ques_idx][ans_idx]
    print('Text paragraph word array size', text_word.shape)
    print('Text paragraph character array size', text_char.shape)
    print('Question word array size', question_word.shape)
    print('Question character array size', question_char.shape)
    print('Answer word array size', answer_word.shape)
    return text_word, text_char, question_word, question_char, answer_word

def batch_loader(text_word, text_char, question_word, question_char, answer_word):
    # Truncate to a multiple of the batch size, then reshape so the leading
    # axis enumerates batches: [num_batches, batch_size, ...].
    reduced_length = (len(text_word) // FLAGS.batch_size) * FLAGS.batch_size
    text_word = text_word[:reduced_length]
    text_char = text_char[:reduced_length]
    question_word = question_word[:reduced_length]
    question_char = question_char[:reduced_length]
    answer_word = answer_word[:reduced_length]
    text_word = np.reshape(text_word, [-1, FLAGS.batch_size, FLAGS.max_text])
    text_char = np.reshape(text_char, [-1, FLAGS.batch_size, FLAGS.max_text, FLAGS.max_char])
    question_word = np.reshape(question_word, [-1, FLAGS.batch_size, FLAGS.max_num_query, FLAGS.max_query_text])
    question_char = np.reshape(question_char, [-1, FLAGS.batch_size, FLAGS.max_num_query, FLAGS.max_query_text, FLAGS.max_query_char])
    answer_word = np.reshape(answer_word, [-1, FLAGS.batch_size, FLAGS.max_num_query, FLAGS.max_num_answer, FLAGS.max_answer])
    return text_word, text_char, question_word, question_char, answer_word
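
# Resulting shapes (sketch, with B = FLAGS.batch_size):
#   text_word      [num_batches, B, max_text]
#   text_char      [num_batches, B, max_text, max_char]
#   question_word  [num_batches, B, max_num_query, max_query_text]
#   question_char  [num_batches, B, max_num_query, max_query_text, max_query_char]
#   answer_word    [num_batches, B, max_num_query, max_num_answer, max_answer]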

def random_shuffle(text_word, text_char, question_word, question_char, answer_word):
    # sklearn's shuffle applies the same random permutation to all five arrays
    text_word, text_char, question_word, question_char, answer_word = \
        shuffle(text_word, text_char, question_word, question_char, answer_word)
    return text_word, text_char, question_word, question_char, answer_word

def zip_file(text_word, text_char, question_word, question_char, answer_word):
    text_word = list(text_word)
    text_char = list(text_char)
    question_word = list(question_word)
    question_char = list(question_char)
    answer_word = list(answer_word)
    zip_list = list(zip(text_word, text_char, question_word, question_char, answer_word))
    return zip_list

def load_data():
    # make the GloVe pickle file once, then reuse it on later runs:
    # glove_vocab = make_glove()
    glove_vocab = pickle_dump(glove_vocab=None)
    # load the MultiRC train/dev data
    train_data, dev_data = multirc_data_load()
    # build vocabularies and index lists from the original data
    word_vocab, char_vocab, whole_word, whole_char, query_word, query_char,\
        label_dict, label_word, word_maxlen, ans_maxnum = make_data(train_data)
    # GloVe index-to-vector table
    word_idx2vec = glove_idx2vec(word_vocab, glove_vocab)
    # pad the index lists into fixed-size arrays
    text_word, text_char, question_word, question_char, answer_word = embedding_matrix(whole_word, whole_char, query_word, query_char, label_word)
    # split into batches
    text_word, text_char, question_word, question_char, answer_word = batch_loader(text_word, text_char, question_word, question_char, answer_word)
    # random shuffling
    text_word, text_char, question_word, question_char, answer_word = random_shuffle(text_word, text_char, question_word, question_char, answer_word)
    # zip all arrays into one list of batch tuples
    zip_list = zip_file(text_word, text_char, question_word, question_char, answer_word)
    return zip_list, word_idx2vec
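
# Minimal driver sketch: runs the full preprocessing pipeline. Assumes
# config.py defines the FLAGS used above and that the GloVe pickle and
# MultiRC JSON files exist at the configured paths.
if __name__ == '__main__':
    batches, word_idx2vec = load_data()
    print('number of batches:', len(batches))
    print('embedding table shape:', word_idx2vec.shape)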