# Please use Python 3.5 or above
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras import optimizers
from keras.models import load_model
import json, argparse, os
import re
import io
import sys
import regex  # assumed to be a local helper module providing cleanText(); not the PyPI "regex" package

# Paths to the training and testing data files. This data can be downloaded
# from a link, details of which will be provided.
trainDataPath = ""
testDataPath = ""
# Path to the directory where the GloVe file is saved.
gloveDir = ""

NUM_FOLDS = None            # Value of K in K-fold cross-validation
NUM_CLASSES = None          # Number of classes - Happy, Sad, Angry, Others
MAX_NB_WORDS = None         # Upper limit on the number of tokens extracted using keras.preprocessing.text.Tokenizer
MAX_SEQUENCE_LENGTH = None  # Sentences with fewer words than this will be padded
EMBEDDING_DIM = None        # The dimension of the word embeddings
BATCH_SIZE = None           # The batch size to be used for training the model
LSTM_DIM = None             # The dimension of the representations learnt by the LSTM model
DROPOUT = None              # Fraction of the units to drop for the linear transformation of the inputs. Ref - https://keras.io/layers/recurrent/
LEARNING_RATE = None        # Learning rate for the optimizer (read from the config in main())
NUM_EPOCHS = None           # Number of epochs to train a model for
PREPROCESS = False          # Preprocess the data?

label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
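
# A minimal sketch of the JSON config consumed by main(). The paths and
# hyperparameter values below are illustrative, not official baseline settings:
# {
#     "train_data_path": "data/train.txt",
#     "test_data_path": "data/test.txt",
#     "glove_dir": "glove/",
#     "num_folds": 5,
#     "num_classes": 4,
#     "max_nb_words": 20000,
#     "max_sequence_length": 100,
#     "embedding_dim": 100,
#     "batch_size": 200,
#     "lstm_dim": 128,
#     "dropout": 0.2,
#     "learning_rate": 0.003,
#     "num_epochs": 10,
#     "preprocess": 0
# }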
def preprocessData(dataFilePath, mode, preprocess=False):
    """Load data from a file, process it, and return indices, conversations
    and labels in separate lists.
    Input:
        dataFilePath : Path to the train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : List of unique conversation IDs
        conversations : List of 3-turn conversations, processed and with each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # Skip the header line
        for line in finput:
            # Convert multiple instances of . ? ! , to a single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            repeatedChars = ['.', '?', '!', ',']
            for c in repeatedChars:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except ValueError:
                        break
                cSpace = ' ' + c + ' '
                line = cSpace.join(lineSplit)

            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                label = emotion2label[line[4]]
                labels.append(label)

            conv = ' <eos> '.join(line[1:4])

            # Remove any duplicate spaces
            duplicateSpacePattern = re.compile(r' +')
            conv = re.sub(duplicateSpacePattern, ' ', conv)

            if preprocess:
                stray_punct = ['‑', '-', "^", ":",
                               ";", "#", ")", "(", "*", "=", "\\", "/"]
                for punct in stray_punct:
                    conv = conv.replace(punct, "")
                processedData = regex.cleanText(conv.lower(), remEmojis=1).lower()
                processedData = processedData.replace("'", "")
                # Remove numbers
                processedData = ''.join([i for i in processedData if not i.isdigit()])
            else:
                processedData = conv.lower()

            indices.append(int(line[0]))
            conversations.append(processedData)

    if mode == "train":
        return indices, conversations, labels
    else:
        return indices, conversations
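
# A worked example of the normalisation above, assuming a tab-separated train
# line (the id, turns and label are made up) and preprocess=False:
#   "156\tOkay...sure!\tSounds good\tSee you then :)\thappy"
# yields index 156, label 1 (happy), and the conversation string
#   "okay . sure ! <eos> sounds good <eos> see you then :)"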
def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics.
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has 4 decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to the Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. A higher value implies better classification
    """
    # [0.1, 0.3, 0.2, 0.1] -> [0, 1, 0, 0]
    discretePredictions = to_categorical(predictions.argmax(axis=1))

    truePositives = np.sum(discretePredictions*ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground - discretePredictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)

    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = (2 * recall * precision) / (precision + recall) if (precision + recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))

    # Average over the three emotion classes (NUM_CLASSES - 1)
    macroPrecision /= 3
    macroRecall /= 3
    macroF1 = (2 * macroRecall * macroPrecision) / (macroPrecision + macroRecall) if (macroPrecision + macroRecall) > 0 else 0
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))

    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()
    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))

    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)
    microF1 = (2 * microRecall * microPrecision) / (microPrecision + microRecall) if (microPrecision + microRecall) > 0 else 0
    # -----------------------------------------------------

    predictions = predictions.argmax(axis=1)
    ground = ground.argmax(axis=1)
    accuracy = np.mean(predictions == ground)

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1
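
# A small worked example of the micro-averaged scores, with made-up per-class
# counts ordered [others, happy, sad, angry] and "others" (index 0) ignored:
#   TP = [12, 30, 25, 20][1:].sum() = 75
#   FP = [ 8,  5, 10,  5][1:].sum() = 20
#   FN = [ 6, 10,  5,  5][1:].sum() = 20
#   microPrecision = 75 / (75 + 20) ~ 0.7895
#   microRecall    = 75 / (75 + 20) ~ 0.7895
#   microF1 = 2 * 0.7895 * 0.7895 / (0.7895 + 0.7895) ~ 0.7895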
def writeNormalisedData(dataFilePath, texts):
    """Write normalised data to a file
    Input:
        dataFilePath : Path to the original train/test file that has been processed
        texts : List containing the normalised 3-turn conversations, separated by the <eos> tag
    """
    normalisedDataFilePath = dataFilePath.replace(".txt", "_normalised.txt")
    with io.open(normalisedDataFilePath, 'w', encoding='utf8') as fout:
        with io.open(dataFilePath, encoding='utf8') as fin:
            fin.readline()  # Skip the header line
            for lineNum, line in enumerate(fin):
                line = line.strip().split('\t')
                normalisedLine = texts[lineNum].strip().split('<eos>')
                fout.write(line[0] + '\t')
                # Write each original turn, followed by the normalised version of the same turn
                fout.write(line[1] + '\t' + normalisedLine[0] + '\t')
                fout.write(line[2] + '\t' + normalisedLine[1] + '\t')
                fout.write(line[3] + '\t' + normalisedLine[2] + '\t')
                try:
                    # If label information is available (train time)
                    fout.write(line[4] + '\n')
                except IndexError:
                    # If label information is not available (test time)
                    fout.write('\n')
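
# Each output line is therefore tab-separated as follows (the label column is
# present only at train time):
#   <id> <turn1> <turn1 normalised> <turn2> <turn2 normalised> <turn3> <turn3 normalised> <label>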
def getEmbeddingMatrix(wordIndex):
    """Populate an embedding matrix using a word-index. If the word "happy" has an index of 19,
    the 19th row in the embedding matrix should contain the embedding vector for the word "happy".
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : A matrix where every row is a 100-dimensional GloVe embedding
    """
    embeddingsIndex = {}
    # Load the embedding vectors from the GloVe file
    with io.open(os.path.join(gloveDir, 'glove.6B.100d.txt'), encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            embeddingVector = np.asarray(values[1:], dtype='float32')
            embeddingsIndex[word] = embeddingVector
    print('Found %s word vectors.' % len(embeddingsIndex))

    # The minimum word index of any word is 1, so row 0 stays all-zeros
    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    for word, i in wordIndex.items():
        embeddingVector = embeddingsIndex.get(word)
        if embeddingVector is not None:
            # Words not found in the embedding index will be all-zeros
            embeddingMatrix[i] = embeddingVector
    return embeddingMatrix
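
# Each line of the GloVe file is a token followed by its vector components,
# e.g. (values illustrative, abridged from the 100 dimensions):
#   the 0.418 0.24968 -0.41242 0.1217 ...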
def buildModel(embeddingMatrix):
    """Constructs the architecture of the model
    Input:
        embeddingMatrix : The embedding matrix to be loaded into the embedding layer.
    Output:
        model : A basic LSTM model
    """
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                               EMBEDDING_DIM,
                               weights=[embeddingMatrix],
                               input_length=MAX_SEQUENCE_LENGTH,
                               trainable=False)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(LSTM(LSTM_DIM, dropout=DROPOUT))
    # Note: 'softmax' is the conventional pairing with categorical_crossentropy
    # for single-label multi-class classification; 'sigmoid' is kept as in the
    # original baseline
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))

    rmsprop = optimizers.RMSprop(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=rmsprop,
                  metrics=['acc'])
    return model
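
# Tensor shapes through the model, for a batch of size B:
#   input      : (B, MAX_SEQUENCE_LENGTH)                 integer word indices
#   Embedding -> (B, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)  frozen GloVe vectors
#   LSTM      -> (B, LSTM_DIM)                            final hidden state
#   Dense     -> (B, NUM_CLASSES)                         per-class scores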
def main():
    parser = argparse.ArgumentParser(description="Baseline Script for SemEval")
    parser.add_argument('-config', help='Config to read details', required=True)
    args = parser.parse_args()

    with open(args.config) as configfile:
        config = json.load(configfile)

    global trainDataPath, testDataPath, gloveDir
    global NUM_FOLDS, NUM_CLASSES, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM
    global BATCH_SIZE, LSTM_DIM, DROPOUT, NUM_EPOCHS, LEARNING_RATE, PREPROCESS

    trainDataPath = config["train_data_path"]
    testDataPath = config["test_data_path"]
    gloveDir = config["glove_dir"]

    NUM_FOLDS = config["num_folds"]
    NUM_CLASSES = config["num_classes"]
    MAX_NB_WORDS = config["max_nb_words"]
    MAX_SEQUENCE_LENGTH = config["max_sequence_length"]
    EMBEDDING_DIM = config["embedding_dim"]
    BATCH_SIZE = config["batch_size"]
    LSTM_DIM = config["lstm_dim"]
    DROPOUT = config["dropout"]
    LEARNING_RATE = config["learning_rate"]
    NUM_EPOCHS = config["num_epochs"]
    PREPROCESS = bool(config["preprocess"])

    print("Processing training data...")
    trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train", preprocess=PREPROCESS)
    # Write normalised text to file to check that normalisation works. Disabled for now; uncomment the following line to enable it
    # writeNormalisedData(trainDataPath, trainTexts)

    print("Processing test data...")
    # mode="train" is used here because this test file also carries gold labels,
    # which are needed to report metrics at the end
    testIndices, testTexts, testLabels = preprocessData(testDataPath, mode="train", preprocess=PREPROCESS)
    # writeNormalisedData(testDataPath, testTexts)

    print("Extracting tokens...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(trainTexts)
    trainSequences = tokenizer.texts_to_sequences(trainTexts)
    testSequences = tokenizer.texts_to_sequences(testTexts)

    wordIndex = tokenizer.word_index
    print("Found %s unique tokens." % len(wordIndex))

    print("Populating embedding matrix...")
    embeddingMatrix = getEmbeddingMatrix(wordIndex)

    data = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    testLabels = to_categorical(np.asarray(testLabels))
    print("Shape of training data tensor: ", data.shape)
    print("Shape of label tensor: ", labels.shape)

    # Randomise the data. Note that this uses the conversation IDs as positions,
    # which assumes the IDs in the train file are exactly 0..N-1
    np.random.shuffle(trainIndices)
    data = data[trainIndices]
    labels = labels[trainIndices]

    # Perform k-fold cross validation
    metrics = {"accuracy": [],
               "microPrecision": [],
               "microRecall": [],
               "microF1": []}

    print("Starting k-fold cross validation...")
    for k in range(NUM_FOLDS):
        print('-'*40)
        print("Fold %d/%d" % (k+1, NUM_FOLDS))
        validationSize = int(len(data)/NUM_FOLDS)
        index1 = validationSize * k
        index2 = validationSize * (k+1)

        xTrain = np.vstack((data[:index1], data[index2:]))
        yTrain = np.vstack((labels[:index1], labels[index2:]))
        xVal = data[index1:index2]
        yVal = labels[index1:index2]
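        # e.g. with len(data) == 100 and NUM_FOLDS == 5, validationSize is 20,
        # so fold k == 1 holds out data[20:40] for validation and trains on
        # data[0:20] stacked with data[40:100]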
print("Building model...")
model = buildModel(embeddingMatrix)
model.fit(xTrain, yTrain,
validation_data=(xVal, yVal),
epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
predictions = model.predict(xVal, batch_size=BATCH_SIZE)
accuracy, microPrecision, microRecall, microF1 = getMetrics(predictions, yVal)
metrics["accuracy"].append(accuracy)
metrics["microPrecision"].append(microPrecision)
metrics["microRecall"].append(microRecall)
metrics["microF1"].append(microF1)
print("\n============= Metrics =================")
print("Average Cross-Validation Accuracy : %.4f" % (sum(metrics["accuracy"])/len(metrics["accuracy"])))
print("Average Cross-Validation Micro Precision : %.4f" % (sum(metrics["microPrecision"])/len(metrics["microPrecision"])))
print("Average Cross-Validation Micro Recall : %.4f" % (sum(metrics["microRecall"])/len(metrics["microRecall"])))
print("Average Cross-Validation Micro F1 : %.4f" % (sum(metrics["microF1"])/len(metrics["microF1"])))
print("\n======================================")
print("Retraining model on entire data to create solution file")
model = buildModel(embeddingMatrix)
model.fit(data, labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
model.save('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE))
# model = load_model('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE))
print("Creating solution file...")
testData = pad_sequences(testSequences, maxlen=MAX_SEQUENCE_LENGTH)
predictions = model.predict(testData, batch_size=BATCH_SIZE)
print("Results on test data...")
import utils
utils.getMetrics(predictions, testLabels)
print("Completed. Model parameters: ")
print("Learning rate : %.3f, LSTM Dim : %d, Dropout : %.3f, Batch_size : %d"
% (LEARNING_RATE, LSTM_DIM, DROPOUT, BATCH_SIZE))
if __name__ == '__main__':
main()
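
# Example invocation (the config filename is illustrative; any JSON file with
# the keys shown near the top of this script will do):
#   python3 baseline.py -config testBaseline.config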