-
Notifications
You must be signed in to change notification settings - Fork 6
/
main.py
139 lines (103 loc) · 4.38 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# coding: utf-8
# # Training a classifier from InferSent embeddings to SNLI
#
# This is how the embeddings were trained end-to-end as well, but we just try with the given embeddings and SNLI training set.
# In[1]:
# %load_ext autoreload
# %autoreload 2
from random import randint
import matplotlib
import numpy as np
import torch
import nltk
nltk.download("punkt")
import torch
import pickle
import classifier as cl
import regrFuncs as rF
import testFuncs as tF
import loadData as lD
import os
# ***************************************************************************
# Defining PATHS
# ***************************************************************************
# Run against the small bundled SNLI subset (True) or the full dataset (False).
toy = True

# toy SNLI provided, download the rest
GLOVE_PATH = './Downloads/glove.840B.300d.txt'
MODEL_PATH = './Downloads/infersent.allnli.pickle'
REGR_MODEL_PATH = './models/'
# If not None, where you want SNLI embeddings to be stored (WARNING: high memory)
EMBED_STORE = None
TEST_OUT_PATH = './regout/'

if toy:
    DATA_PATH = './Downloads/SNLI/toy/'
    REGR_MODEL_PATH += 'TOY'          # keep toy artefacts separate from real ones
    EMBED_STORE = None
    TEST_OUT_PATH += 'TOY'
    TEST_DATA_PATH = './testData/toy/'
else:
    DATA_PATH = './Downloads/SNLI/true/'
    TEST_DATA_PATH = './testData/true/'

# Bundle the output locations so downstream helpers take one argument.
outpaths = {
    'REGR_MODEL_PATH': REGR_MODEL_PATH,
    'TEST_OUT_PATH': TEST_OUT_PATH,
    'TEST_DATA_PATH': TEST_DATA_PATH,
}

# SNLI label <-> integer id mappings (each the inverse of the other).
id2label = {0: 'CONTRADICTION', 1: 'NEUTRAL', 2: 'ENTAILMENT'}
label2id = {lab: idx for idx, lab in id2label.items()}
# ***************************************************************************
# Defining the model
# ***************************************************************************
# Load the pre-trained InferSent encoder.  The map_location lambda forces all
# tensors onto CPU storage regardless of the device the checkpoint was saved on.
model = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)
model.use_cuda = False
model.set_glove_path(GLOVE_PATH)

# Restrict the GloVe vocabulary: a tiny vocab is enough for the toy subset,
# 100k most frequent words otherwise.
model.build_vocab_k_words(K=100 if toy else 100000)

batch_size = 64
useCudaReg = False  # train the downstream regressions on CPU as well
# ***************************************************************************
# Loading training data
# ***************************************************************************
snli_data = lD.loadSNLI(DATA_PATH, label2id)

# Discover the scramble task names from the label files in the test-data
# folder (filename prefix of length 7 is stripped, e.g. -> 'comp').
# Previously hard-coded: ['adjr', 'comp', 'ncon', 'subjv', 'temp', 'verb'].
tasks = [fname[7:]
         for fname in os.listdir(outpaths['TEST_DATA_PATH'])
         if 'label' in fname]

# Load scramble data in the same format as SNLI (suitable for training)
scramble_data_path = './testData/toy/' if toy else './testData/true/'
scramble_data = lD.load_scramble_all(scramble_data_path, label2id, tasks)

# Combined dataset of SNLI + scramble data
combined_data = lD.sort_group(lD.merge_groups([scramble_data, snli_data]))

# Select which data to train on: snli_data, combined_data, or scramble_data
training_data = snli_data
# ***************************************************************************
# Helper functions for train and test
# ***************************************************************************
def allClassifiersExist(name, classifiers):
    """Return True iff a trained model file exists for every classifier.

    Looks for files named ``REGR_MODEL_PATH + name + classifier`` (e.g.
    './models/InferSentLogReg').

    Fixes the original, which accumulated ``flag *= os.path.exists(...)`` —
    coercing the result to an int (0/1) and always stat-ing every file.
    ``all(...)`` returns a genuine bool and short-circuits on the first
    missing file; truthiness for existing callers is unchanged.
    """
    return all(os.path.exists(REGR_MODEL_PATH + name + classifier)
               for classifier in classifiers)
def runAllTests(names, classifiers, model, tasks, outpaths, label2id=None):
    """Run tF.runtests for every (embedding name, classifier) combination."""
    for embed_name in names:
        for clf in classifiers:
            tF.runtests(embed_name, clf, model, tasks, outpaths, label2id)
# ***************************************************************************
# Training classifiers
# ***************************************************************************
names = ['InferSent', 'BOW']        # embedding types
classifiers = ['LogReg', 'MLP']     # downstream classifiers

for name in names:
    # Guard clause: skip the (expensive) embedding computation entirely when
    # every classifier for this embedding type is already trained and saved.
    if allClassifiersExist(name, classifiers):
        continue
    embeddings = rF.create_embed(model, training_data,
                                 batch_size, name, EMBED_STORE)
    for classifier in classifiers:
        # Only train the individual classifiers that are still missing.
        if not os.path.exists(REGR_MODEL_PATH + name + classifier):
            rF.trainreg(embeddings, training_data,
                        classifier, name, outpaths, useCudaReg)
# ***************************************************************************
# Testing classifiers on Scramble dataset
# ***************************************************************************
# NOTE(review): the scramble-task evaluation below is currently disabled;
# re-enable these two lines to test on the discovered scramble tasks.
#print("Running tests for tasks: ", tasks)
#runAllTests(names, classifiers, model, tasks, outpaths, label2id)
# Retest the trained regressions
# Instead, re-evaluate the trained classifiers on the SNLI 'test' and 'dev'
# splits by pointing the test-data path back at the SNLI data directory.
tasks = ['test', 'dev']
outpaths['TEST_DATA_PATH'] = DATA_PATH
print("Running tests for tasks: ", tasks)
runAllTests(names, classifiers, model, tasks, outpaths, label2id)