import os
import csv
import json
import random
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from PIL import Image
from torchvision import transforms
import rnn
'''
Evaluate the performance of the trained model under different settings:
+ 'eval': held-out object set
+ 'YCB': YCB object set
+ 'robot': objects in the robot's environment, captured by the robot's camera
'''
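# Example invocations (a sketch, assuming the default paths below and the
# dict/ vocab files generated by run.py):
#   python eval.py --mode eval  --input_test data/corpus-test.csv
#   python eval.py --mode YCB   --input_embedding data/ycb-object-embedding.csv
#   python eval.py --mode robot --image_dir robot --command 'An object to contain'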
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='models/trained-model.pt',
                    help='saved checkpoint')
parser.add_argument('--input_test', type=str, default='data/corpus-test.csv',
                    help='test data')
# below file is generated by ycb.py
parser.add_argument('--input_embedding', type=str, default='data/ycb-object-embedding.csv',
                    help='ycb image embeddings')
parser.add_argument('--ycb_vo', type=str, default='data/ycb-verb-object.csv',
                    help='ycb verb-object pairs for testing')
parser.add_argument('--image_dir', type=str, default='robot',
                    help='directory for robot images')
parser.add_argument('--command', type=str, default='An object to contain',
                    help='natural language command for robot (e.g. "An object to contain")')
parser.add_argument('--num_layers', type=int, default=1,
                    help='number of layers of model')
parser.add_argument('--rnn_input', type=int, default=128,
                    help='dimension of the word embeddings fed to the RNN')
parser.add_argument('--hidden_dim', type=int, default=64,
                    help='dimension of the RNN hidden state')
parser.add_argument('--rnn_output', type=int, default=2048,
                    help='dimension of the output embedding')
parser.add_argument('--dropout', type=float, default=0.0,
                    help='dropout rate of the RNN')
parser.add_argument('--ret_num', type=int, default=5,
                    help='number of candidate objects per retrieval task')
parser.add_argument('--mode', type=str, default='robot',
                    help='3 possible evaluation modes: YCB, robot, eval')
# note: argparse's type=bool treats any non-empty string as True,
# so these two flags are parsed explicitly
parser.add_argument('--DEBUG', action='store_true',
                    help='print per-task retrieval details')
parser.add_argument('--verb_only', type=lambda s: s.lower() in ('true', '1', 'yes'),
                    default=True, help='use verb-only command templates')
opt = parser.parse_args()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cuda = True if torch.cuda.is_available() else False
Tensor = torch.cuda.LongTensor if cuda else torch.LongTensor
# load pretrained resnet model
resnet = models.resnet101(pretrained=True)
# drop the final fc layer so the network outputs the average-pooled features
model_avgpool = nn.Sequential(*list(resnet.children())[:-1])
model_avgpool.eval()
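# ResNet-101's average-pooled features are 2048-dimensional, which is why
# --rnn_output defaults to 2048: the language model's output is compared
# directly against these image embeddings.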
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
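# (standard ImageNet preprocessing: the mean/std above are the ImageNet
# statistics that torchvision's pretrained ResNet expects)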
# Load word vocab (generated in run.py)
with open("dict/word2id.json") as f:
    word2id = json.load(f)
with open("dict/id2word.json") as f:
    id2word = json.load(f)
# Load trained model
model = nn.Sequential(
    nn.Embedding(len(word2id), opt.rnn_input),
    rnn.RNNModel(opt.rnn_input, opt.rnn_output, opt.hidden_dim, opt.num_layers,
                 opt.dropout, device)).to(device)
model.load_state_dict(torch.load(opt.model, map_location=device))
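# The loaded model maps a tokenized command to a single embedding in the same
# space as the object embeddings, so retrieval below reduces to ranking
# candidates by cosine similarity against this output.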
# Generate natural language command from templates, given a verb-object pair
def gen_from_template(verb, obj):
    pre_obj = ['Give me the ', 'Hand me the ', 'Pass me the ', 'Fetch the ',
               'Get the ', 'Bring the ', 'Bring me the ',
               'I need the ', 'I want the ',
               'I need a ', 'I want a ']
    pre_verb = ['An item that can ', 'An object that can ',
                'Give me something that can ', 'Give me an item that can ',
                'Hand me something with which I can ',
                'Give me something with which I can ',
                'Hand me something to ', 'Give me something to ',
                'I want something to ', 'I need something to ']
    if opt.verb_only:
        template = random.choice(pre_verb)
        sentence = template + verb
    else:
        template = random.choice(pre_obj)
        sentence = template + obj + ' to ' + verb
    return sentence
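# For example (templates are sampled at random, so output varies):
#   gen_from_template('pour', 'mug') -> 'I need something to pour'  (verb_only)
#   gen_from_template('pour', 'mug') -> 'Hand me the mug to pour'   (otherwise)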
# Map each word in the natural language command to its ID in the vocab
def process_command(command, word2id, id2word):
    sentence = []
    s = command.lower().split()
    for word in s:
        if word in word2id:
            sentence.append(word2id[word])
        else:
            sentence.append(word2id['UNK'])
    return sentence
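# For example, if 'hammer' were out of vocabulary,
# process_command('Get the hammer', word2id, id2word) would return
# [word2id['get'], word2id['the'], word2id['UNK']].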
# Generate retrieval tasks (from verb-object pairs) to test the model
def gen_ret(vo_dict, objects, aff_dict, word2id, id2word, exclude):
    ret_set = []
    for verb in vo_dict:
        for obj in vo_dict[verb]:
            if obj not in exclude:
                # generate language command from the verb-object pair
                sentence = gen_from_template(verb, obj)
                sentence = process_command(sentence, word2id, id2word)
                l = [sentence]
                ret_objs = [[obj] + random.choice(aff_dict[obj])]
                all_o = [obj]
                while len(ret_objs) < opt.ret_num:
                    o = random.choice(objects)
                    # only sample objects that cannot be paired with the current verb
                    # and ensure that the retrieval set has all unique objects
                    # (objects from different classes)
                    if (o not in vo_dict[verb]) and (o not in all_o) and (o not in exclude):
                        ret_objs.append([o] + random.choice(aff_dict[o]))
                        all_o.append(o)
                l.append(ret_objs)
                ret_set.append(l)
    return ret_set
# Generate retrieval tasks (from test examples) to test the model
def genRet(test, vo_dict):
    ret_set = []
    for verb, obj, sentence, affordances, img in test:
        l = [sentence]
        # the object included in the current test example
        # is the first candidate object for this retrieval task
        ret_objs = [[obj, affordances, img]]
        all_o = [obj]
        # each retrieval task includes ret_num (5) candidate objects
        # for the model to select from
        while len(ret_objs) < opt.ret_num:
            sample = random.choice(test)
            # only sample objects that cannot be paired with the current verb
            # and make sure that all objects in the retrieval set are unique
            # (from different object classes)
            if (sample[1] not in vo_dict[verb]) and (sample[1] not in all_o):
                ret_objs.append([sample[1], sample[3], sample[4]])
                all_o.append(sample[1])
        l.append(ret_objs)
        ret_set.append(l)
    return ret_set
# Test the model on retrieval tasks (selecting the correct object from a set of 5)
def ret(model, ret_set, id2word):
    model.eval()
    correct, correct2 = 0.0, 0.0
    with torch.no_grad():
        for sentence, ret_objs in ret_set:
            s = ''
            for i in sentence:
                s += id2word[str(i)] + ' '
            sentence = Tensor(sentence).unsqueeze(0)
            sims = []
            output = model(sentence)
            for obj_name, obj, img in ret_objs:
                # the affordance embedding is stored as a bracketed,
                # comma-separated string; strip the brackets and parse
                obj = np.fromstring(obj[1:-1], dtype=float, sep=',')
                affordances = torch.from_numpy(
                    obj).to(device).float().unsqueeze(0)
                sim = F.cosine_similarity(output, affordances)
                sims.append(sim.item())
            # rank each candidate object based on the similarity value between
            # its embedding and the model's output embedding
            # (we want the model's output to be the most similar to the
            # correct object's embedding, as the model will select the object
            # with the embedding most similar to its output)
            sort = sorted(sims, reverse=True)
            # the correct object is always the first candidate (index 0)
            if sims[0] == sort[0]:
                correct += 1
                correct2 += 1
                result = 'FIRST'
            elif sims[0] == sort[1]:
                correct2 += 1
                result = 'SECOND'
            else:
                result = 'BOTH WRONG'
            if opt.DEBUG:
                print()
                print(result)
                l = []
                for i, lt in enumerate(ret_objs):
                    obj_name, aff, img = lt
                    l.append([obj_name, sims[i], img])
                top1, top2 = sims.index(sort[0]), sims.index(sort[1])
                t1, t2 = ret_objs[top1][0], ret_objs[top2][0]
                print(s)
                print(output)
                print(l)
                print(t1, ',', t2)
    # Top1 counts tasks where the correct object ranked first;
    # Top2 additionally counts tasks where it ranked second
    print('RET_ACC Top1: {} Top2: {}'.format(
        correct / len(ret_set), correct2 / len(ret_set)))
if opt.mode == 'YCB':  # evaluation on YCB dataset
    aff_dict = {}
    with open(opt.input_embedding, 'r') as f:
        data = list(csv.reader(f))
    for row in data:
        obj = str(row[0]).lower()
        aff = str(row[1])
        img = str(row[2])
        if obj not in aff_dict:
            aff_dict[obj] = []
        aff_dict[obj].append([aff, img])
    vo_dict = {}
    objects = []
    with open(opt.ycb_vo, 'r') as f:
        data = list(csv.reader(f))
    for row in data:
        verb = str(row[0]).lower()
        obj = str(row[1]).lower()
        if verb not in vo_dict:
            vo_dict[verb] = []
        if obj not in vo_dict[verb]:
            vo_dict[verb].append(obj)
        if obj not in objects:
            objects.append(obj)
    # exclude objects the model has already seen during training
    exclude = ['banana', 'strawberry', 'orange', 'pitcher base', 'plate',
               'phillips screwdriver', 'flat screwdriver', 'hammer',
               'baseball', 'toy airplane']
    # generate retrieval tasks (from the annotated verb-object pairs
    # and object embeddings from the YCB object set) to test the model
    ret_set = gen_ret(vo_dict, objects, aff_dict, word2id, id2word, exclude)
    ret(model, ret_set, id2word)
elif opt.mode == 'robot':  # robot demo
    model.eval()
    with torch.no_grad():
        print(opt.command)
        command = process_command(opt.command, word2id, id2word)
        sentence = Tensor(command).unsqueeze(0)
        predicted = model(sentence)
        # use the pretrained resnet model to generate embeddings
        # for the object images captured by the robot
        embeddings = []
        for f in os.listdir(opt.image_dir):
            input_image = Image.open(os.path.join(opt.image_dir, f))
            input_tensor = preprocess(input_image)
            input_batch = input_tensor.unsqueeze(0)
            # move the input and model to GPU for speed if available
            if torch.cuda.is_available():
                input_batch = input_batch.to('cuda')
                model_avgpool.to('cuda')
            try:
                output = model_avgpool(input_batch)
            except Exception:
                # skip images that cannot be encoded instead of silently
                # reusing a stale embedding from the previous iteration
                print('Cannot encode image', os.path.join(opt.image_dir, f))
                continue
            output = torch.flatten(output, 1)
            embeddings.append([f, output])
        sims = []
        for _, em in embeddings:
            sim = F.cosine_similarity(predicted, em)
            sims.append(sim.item())
        # rank each candidate object based on the similarity value between
        # its embedding and the model's output embedding; the model
        # will select the object with the embedding most similar to its output
        sort = sorted(sims, reverse=True)
        # print the image filenames ranked by similarity, most similar first
        ranked = [embeddings[sims.index(v)][0] for v in sort]
        print(' , '.join(ranked[:5]))
else:  # evaluation on held-out test set
    with open("data/vo_dict_verb.json") as f:
        vo_dict = json.load(f)
    with open(opt.input_test, 'r') as test_file:
        test_data = list(csv.reader(test_file))
    test_dt = []
    for row in test_data:
        affordances = str(row[3])
        sentence = process_command(row[2], word2id, id2word)
        test_dt.append([row[0], row[1], sentence, affordances, row[4]])
    # generate retrieval tasks (from the held-out test data) to test the model
    ret_set = genRet(test_dt, vo_dict)
    ret(model, ret_set, id2word)