# eval.py
import os
import sys
from operator import itemgetter
import sklearn
import sklearn.metrics
import torch
import torch.autograd as autograd
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
## general evaluation
def eval(dataset, x_val, y_val, model, args):
    """Evaluate ``model`` on a validation set and return the resulting F1 score.

    The model is put into evaluation mode and run over the mini-batches
    produced by ``dataset.gen_minibatch``. The arg-max over dim 1 of the
    logits is taken as the prediction for each instance, and the collected
    predictions are scored (and printed) by ``show_results``.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    depend on it.

    Args:
        dataset: Object providing ``gen_minibatch(x, y, batch_size, args,
            shuffle=...)``; it is also forwarded to ``show_results``.
        x_val: Validation inputs, passed straight to ``gen_minibatch``.
        y_val: Validation labels, passed straight to ``gen_minibatch``.
        model: Callable returning a ``(logit, represent)`` pair for a batch
            of features.
        args: Options object; ``batch_size``, ``cuda`` and ``use_idk`` are
            read here.

    Returns:
        Whatever ``show_results`` returns. When ``args.use_idk`` is set,
        results are printed for every IDK ratio and the value for the last
        (largest) ratio is returned.
    """
    print("using normal evaluation ...")
    model.eval()

    y_pred = []
    y_truth = []
    logit_var = []
    # Per-batch chunks are gathered and concatenated once after the loop
    # instead of re-copying a growing tensor on every batch. The lists are
    # seeded with the same empty tensors the incremental version started
    # from, so the concatenated result (including the no-batch case) matches.
    represent_chunks = [torch.FloatTensor()]
    target_chunks = [torch.LongTensor()]

    # Shuffling does not affect the metrics: predictions and targets are
    # collected in the same batch order.
    val_iter = dataset.gen_minibatch(x_val, y_val, args.batch_size, args, shuffle=True)

    # Inference only: no autograd graph is needed for any of the values below.
    with torch.no_grad():
        for batch in val_iter:
            feature, target = batch[0], batch[1]
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()
            logit, represent = model(feature)

            y_pred += torch.max(logit, 1)[1].view(target.size()).data.tolist()
            y_truth += target.data.tolist()
            # Variance of each instance's logits across dim 1; used below to
            # rank instances when the IDK filter is enabled.
            logit_var += np.var(logit.cpu().data.numpy(), axis=1).tolist()
            represent_chunks.append(represent.data.cpu())
            target_chunks.append(target.data.cpu())

    represent_all = torch.cat(represent_chunks, 0)
    target_all = torch.cat(target_chunks, 0)

    if not args.use_idk:
        return show_results(dataset, y_truth, y_pred, represent_all, target_all)

    # Apply the IDK ratio to filter out the uncertain instances: rank the
    # instances by logit variance, highest first, and drop the tail. The sort
    # is stable, so ties keep their original order; unlike unpacking
    # zip(*sorted(enumerate(...))), this does not fail on an empty list.
    ranked = sorted(range(len(logit_var)), key=logit_var.__getitem__, reverse=True)
    for idk_ratio in np.arange(0, 0.45, 0.05):
        test_num = int(len(ranked) * (1 - idk_ratio))
        kept = ranked[:test_num]
        y_truth_cur = [y_truth[i] for i in kept]
        y_pred_cur = [y_pred[i] for i in kept]
        f1_score = show_results(dataset, y_truth_cur, y_pred_cur, represent_all, target_all)
    return f1_score
def show_results(dataset, y_truth, y_pred, represent=None, target=None):
class_num = dataset.get_class_num()
if class_num == 2:
accuracy_score = (sklearn.metrics.accuracy_score(y_truth, y_pred))
f1_score = (sklearn.metrics.f1_score(y_truth, y_pred, pos_label=1))
prec_score = (sklearn.metrics.precision_score(y_truth, y_pred, pos_label=1))
recall_score = (sklearn.metrics.recall_score(y_truth, y_pred, pos_label=1))
confusion_mat = (sklearn.metrics.confusion_matrix(y_truth, y_pred))
print(accuracy_score, f1_score, prec_score, recall_score, sep='\t')
#print(confusion_mat)
return f1_score
elif class_num > 2:
accuracy_score = (sklearn.metrics.accuracy_score(y_truth, y_pred))
micro_f1_score = (sklearn.metrics.f1_score(y_truth, y_pred, average='micro'))
macro_f1_score = (sklearn.metrics.f1_score(y_truth, y_pred, average='macro'))
print(accuracy_score, micro_f1_score, macro_f1_score, sep='\t')
return micro_f1_score
# if represent is not None:
# loss_intra, loss_inter = eval_metric(represent, target)
# print(loss_intra, loss_inter)
# print('\nEvaluation - acc: {:.4f} f1: {:.4f} precision: {:.4f} recall: {:.4f}\n'.format(accuracy_score,
# f1_score,
# prec_score,
# recall_score))