newsgrp_dataset.py
import itertools
import os
import re

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from torch.autograd import Variable

class NewsGroup_DataSet:

    def __init__(self, args):
        """Create a 20 Newsgroups dataset instance."""
        self.data_dir = args.data_path + '20news/'
        self.vocab_file = self.data_dir + 'vocabulary.txt'
        self.train_df_file = self.data_dir + 'train_df.pkl'
        self.test_df_file = self.data_dir + 'test_df.pkl'
        self.lemmatizer = WordNetLemmatizer()
        self.class_num = -1

    def clean_str(self, string):
        """
        Tokenization/string cleaning.
        Originally taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " 's", string)
        string = re.sub(r"\'ve", " 've", string)
        string = re.sub(r"n\'t", " n't", string)
        string = re.sub(r"\'re", " 're", string)
        string = re.sub(r"\'d", " 'd", string)
        string = re.sub(r"\'ll", " 'll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        # note: the replacement strings must not escape the punctuation;
        # "\(" in a replacement inserts a literal backslash
        string = re.sub(r"\(", " ( ", string)
        string = re.sub(r"\)", " ) ", string)
        string = re.sub(r"\?", " ? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()
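    # Illustrative example (worked by hand from the rules above, so treat the
    # exact output as approximate):
    #   clean_str("Isn't this (really) great, folks?")
    #   -> "Is n't this ( really ) great , folks ?"
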
    def get_vocab_size(self):
        return self.vocab_size

    def get_vocab_file(self):
        return self.vocab_file

    def get_class_num(self):
        return self.class_num

    def cal_class_num(self, df):
        ids_labels = set()
        # iterate via df.index because the frame may have been shuffled
        # during the train/test split
        for index in df.index:
            ids_labels.add(df.Class[index])
        return len(ids_labels)

    def load_vocab(self):
        with open(self.vocab_file) as f:
            vocab_words = f.readlines()
        # strip trailing whitespace such as the '\n' at the end of each line
        vocab_words = [x.strip() for x in vocab_words]
        self.vocab_size = len(vocab_words)
        vocab_wordidx = {w: i for i, w in enumerate(vocab_words)}
        return vocab_wordidx

    def sent_parse(self, text):
        sentences = nltk.tokenize.sent_tokenize(text)
        sentences_tokens = []
        for sentence in sentences:
            tokens = nltk.tokenize.wordpunct_tokenize(sentence)
            processed_tokens = []
            for token in tokens:
                token = self.clean_str(token)
                token = self.lemmatizer.lemmatize(token)
                if token:  # drop tokens that clean_str reduced to ''
                    processed_tokens.append(token)
            sentences_tokens.append(processed_tokens)
        return sentences_tokens
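    # Illustrative example (the output depends on the NLTK tokenizers and the
    # WordNet lemmatizer, so it is approximate):
    #   sent_parse("The cats sat. They left.")
    #   -> [['The', 'cat', 'sat'], ['They', 'left']]
    # Punctuation-only tokens are dropped because clean_str reduces them to ''.
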
    def generate_data(self, val_ratio=.1, shuffle=True):
        """Load the data and return (train, val, test) splits as
        (word-id array, label array) pairs.

        Arguments:
            val_ratio: fraction of the training data held out for validation.
            shuffle: currently unused; train_test_split shuffles by default.
        """
        print("generating 20 newsgroup data ...")
        train_df, test_df, vocab_wordidx = self.load_data_if_not_exist()
        self.class_num = self.cal_class_num(train_df)
        train_df, val_df = train_test_split(train_df,
                                            test_size=val_ratio, random_state=967898,
                                            stratify=train_df.Class)
        x_train, y_train = self.format_data(train_df, vocab_wordidx)
        x_val, y_val = self.format_data(val_df, vocab_wordidx)
        x_test, y_test = self.format_data(test_df, vocab_wordidx)
        return (x_train, y_train), (x_val, y_val), (x_test, y_test)

    def load_data_if_not_exist(self):
        """Load the cached train/test DataFrames and vocabulary if they
        exist; otherwise build them from the 20 Newsgroups corpus and
        cache them to disk.
        """
        # check for the cached data files
        if os.path.isfile(self.train_df_file) and os.path.isfile(self.test_df_file):
            train_df = pd.read_pickle(self.train_df_file)
            test_df = pd.read_pickle(self.test_df_file)
            vocab_wordidx = self.load_vocab()
            return train_df, test_df, vocab_wordidx
        # load the raw data
        categories = ['alt.atheism',
                      'comp.graphics',
                      'comp.os.ms-windows.misc',
                      'comp.sys.ibm.pc.hardware',
                      'comp.sys.mac.hardware',
                      'comp.windows.x',
                      'misc.forsale',
                      'rec.autos',
                      'rec.motorcycles',
                      'rec.sport.baseball',
                      'rec.sport.hockey',
                      'sci.crypt',
                      'sci.electronics',
                      'sci.med',
                      'sci.space',
                      'soc.religion.christian',
                      'talk.politics.guns',
                      'talk.politics.mideast',
                      'talk.politics.misc',
                      'talk.religion.misc'
                      ]
        train_categories = categories
        # note: remove expects a tuple, so ('headers',) rather than ('headers')
        newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers',),
                                              categories=train_categories)
        newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers',))
        train_df = pd.DataFrame(
            {'Text': [self.sent_parse(text) for text in newsgroups_train.data],
             'Class': newsgroups_train.target}
        )
        test_df = pd.DataFrame(
            {'Text': [self.sent_parse(text) for text in newsgroups_test.data],
             'Class': newsgroups_test.target}
        )
        train_df.to_pickle(self.train_df_file)
        test_df.to_pickle(self.test_df_file)
        # flatten the tokenized sentences to collect every word
        train_sent_words = train_df.Text.apply(lambda ll: list(itertools.chain.from_iterable(ll)))
        train_sent_words = list(itertools.chain.from_iterable(train_sent_words))
        test_sent_words = test_df.Text.apply(lambda ll: list(itertools.chain.from_iterable(ll)))
        test_sent_words = list(itertools.chain.from_iterable(test_sent_words))
        # generate the vocabulary words
        vocab_words = list(set(train_sent_words) | set(test_sent_words))
        # add special tokens: unknown word, start/end of sentence, start/end of document
        vocab_words.append("<UNK>")
        vocab_words.append("<SOSent>")
        vocab_words.append("<EOSent>")
        vocab_words.append("<SODoc>")
        vocab_words.append("<EODoc>")
        vocab_words.sort()
        # save the vocabulary, one word per line
        with open(self.vocab_file, 'w') as f:
            for word in vocab_words:
                f.write(word)
                f.write('\n')
        self.vocab_size = len(vocab_words)
        vocab_wordidx = {w: i for i, w in enumerate(vocab_words)}
        return train_df, test_df, vocab_wordidx

    def format_data(self, data_frame, vocab_idx):
        ids_document = []
        ids_labels = []
        # iterate via data_frame.index because the frame may have been
        # shuffled during the train/test split
        for index in data_frame.index:
            document = data_frame.Text[index]
            text_word_list = [self.convertSent2WordIds(sentence, vocab_idx)
                              for sentence in document]
            # flatten the per-sentence id lists into one document-level list
            text_word_list = [item for sublist in text_word_list for item in sublist]
            ids_document.append(text_word_list)
            # labels
            ids_labels.append(data_frame.Class[index])
        # documents have ragged lengths, so store them in an object array
        return np.array(ids_document, dtype=object), np.array(ids_labels)
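    # Illustrative: each returned document is one flat list of word ids with
    # per-sentence start/end tags, e.g. (hypothetical ids)
    #   [[0, 3, 4, 1, 0, 7, 1], [0, 5, 1], ...]
    # stored in an object array because documents differ in length.
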
    def convertSent2WordIds(self, sentence, vocab_idx):
        """Convert a sentence (a list of word tokens) to a list of word ids
        based on vocab_idx, wrapped in start/end-of-sentence tags.
        """
        sentence_start_tag_idx = vocab_idx["<SOSent>"]
        sentence_end_tag_idx = vocab_idx["<EOSent>"]
        word_unknown_tag_idx = vocab_idx["<UNK>"]
        sent2id = [sentence_start_tag_idx]
        # out-of-vocabulary words map to <UNK> instead of raising KeyError
        sent2id += [vocab_idx.get(word, word_unknown_tag_idx) for word in sentence]
        sent2id.append(sentence_end_tag_idx)
        return sent2id
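    # Illustrative example with a toy index (ids are hypothetical):
    #   vocab_idx = {'<SOSent>': 0, '<EOSent>': 1, '<UNK>': 2, 'cat': 3, 'sat': 4}
    #   convertSent2WordIds(['cat', 'sat', 'purred'], vocab_idx) -> [0, 3, 4, 2, 1]
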
    def iterate_minibatches(self, inputs, targets, batchsize, shuffle=False):
        assert inputs.shape[0] == targets.shape[0]
        if shuffle:
            indices = np.arange(inputs.shape[0])
            np.random.shuffle(indices)
        for start_idx in range(0, inputs.shape[0] - batchsize + 1, batchsize):
            if shuffle:
                excerpt = indices[start_idx:start_idx + batchsize]
            else:
                excerpt = slice(start_idx, start_idx + batchsize)
            yield inputs[excerpt], targets[excerpt]
        # yield the final partial batch, if any
        start_idx = inputs.shape[0] - inputs.shape[0] % batchsize
        if start_idx < inputs.shape[0]:
            if shuffle:
                excerpt = indices[start_idx:]
            else:
                excerpt = slice(start_idx, inputs.shape[0])
            yield inputs[excerpt], targets[excerpt]
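    # Illustrative: with 10 samples and batchsize=4 this yields batches of
    # 4, 4, and 2 (the remainder); with batchsize=5 it yields 5 and 5 only.
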
    def gen_minibatch(self, tokens, labels, mini_batch_size, args, shuffle=True):
        for token, label in self.iterate_minibatches(tokens, labels, mini_batch_size, shuffle=shuffle):
            token = self.pad_batch(token)
            token.data.t_()
            label = Variable(torch.from_numpy(label), requires_grad=False)
            if args.cuda:
                token, label = token.cuda(), label.cuda()
            yield (token, label)

    def pad_batch(self, mini_batch):
        """Pad (or truncate) every sequence in the batch to the mean sequence
        length, then return a (seq_len, batch_size) Variable of word ids."""
        mini_batch_size = len(mini_batch)
        mean_token_len = int(np.mean([len(x) for x in mini_batch]))
        main_matrix = np.zeros((mini_batch_size, mean_token_len), dtype=np.int64)
        for i in range(main_matrix.shape[0]):
            for j in range(main_matrix.shape[1]):
                try:
                    main_matrix[i, j] = mini_batch[i][j]
                except IndexError:
                    pass  # shorter sequences stay zero-padded
        return Variable(torch.from_numpy(main_matrix).transpose(0, 1))
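
# A minimal usage sketch (illustrative, not part of the original module):
# `Args` stands in for the argparse namespace the class expects, and
# './data/' is an assumed data root containing a '20news/' folder.
if __name__ == "__main__":
    class Args:
        data_path = './data/'  # hypothetical location of the 20news/ cache
        cuda = False

    args = Args()
    dataset = NewsGroup_DataSet(args)
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = dataset.generate_data()
    print("classes:", dataset.get_class_num(), "vocab:", dataset.get_vocab_size())
    # draw one padded mini-batch of (tokens, labels)
    tokens, labels = next(dataset.gen_minibatch(x_train, y_train, 32, args))
    print(tokens.size(), labels.size())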