-
Notifications
You must be signed in to change notification settings - Fork 2
/
train_hinge_hamming_SVRG_BB.m
92 lines (76 loc) · 2.89 KB
/
train_hinge_hamming_SVRG_BB.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
function [W, obj] = train_hinge_hamming_SVRG_BB( X, Y, lambda_1, alpha )
% Train a linear multi-label classifier by minimizing the hinge-loss
% surrogate of the Hamming loss with SVRG, using Barzilai-Borwein (BB)
% steps to adapt the learning rate between epochs.
%
% Inputs:
%   X        - [num_instance x num_feature] data matrix
%   Y        - [num_instance x num_label] label matrix (entries in {-1, +1})
%   lambda_1 - L2 regularization coefficient
%   alpha    - initial learning rate (replaced by BB step from epoch 2 on)
% Outputs:
%   W        - [num_feature x num_label] learned weight matrix
%   obj      - per-epoch objective values (column vector, one entry per
%              completed epoch; may be empty on early NaN abort)
    [num_instance, num_feature] = size(X);
    num_label = size(Y, 2);
    W = zeros(num_feature, num_label);
    obj = [];  % ensure obj is defined even if we abort before epoch 1 finishes
    % Warm-start with a few plain SGD steps before the SVRG loop.
    for i = 1: 10
        index = randi(num_instance);
        GD_one = calculate_one_gradient(X(index,:), Y(index,:), W, lambda_1);
        W = W - alpha * GD_one;
    end
    num_s = 30;              % number of outer SVRG epochs
    m = 2 * num_instance;    % inner-loop length per epoch
    epsilon = 0;             % relative-improvement stopping tolerance
    for i = 1: num_s
        W1 = W;
        % Full gradient at the epoch snapshot (SVRG control variate).
        fG1 = calculate_all_gradient(X, Y, W1, lambda_1);
        if i > 1
            if i > 2 && abs(obj(i-1, 1) - obj(i-2, 1)) / obj(i-2, 1) <= epsilon
                break;
            end
            % BB step size from consecutive snapshots, scaled by 1/m.
            alpha = norm(W1-W0, 'fro')^2 / trace((W1-W0)'*(fG1-fG0)) / m;
        end
        fG0 = fG1;
        W0 = W1;
        for j = 1: m
            index = randi(num_instance);
            GD_one = calculate_one_gradient(X(index,:), Y(index,:), W, lambda_1);
            GD_ = calculate_one_gradient(X(index,:), Y(index,:), W1, lambda_1);
            % Variance-reduced stochastic update.
            W = W - alpha * (GD_one - GD_ + fG1);
            % BUGFIX: `if isnan(W)` on a matrix is true only when ALL
            % entries are NaN; use any(...) so a single NaN aborts.
            if any(isnan(W(:)))
                return;
            end
        end
        obj(i,1) = calculate_objective_function(X, Y, W, lambda_1);
        fprintf('Step %d: the objective function value is %.5f\n', i, obj(i,1));
    end
end
function [f_value] = calculate_objective_function(X, Y, W, lambda_1)
% Regularized multi-label hinge objective:
%   0.5*lambda_1*||W||_F^2 + mean over instances of (hinge loss / num_class).
    [n_inst, n_cls] = size(Y);
    % Element-wise hinge: max(0, 1 - y * score).
    margins = Y .* (X * W);
    hinge = max(zeros(n_inst, n_cls), ones(n_inst, n_cls) - margins);
    % Average the total hinge loss over labels, then over instances.
    data_term = sum(sum(hinge, 2)) / n_cls / n_inst;
    reg_term = 0.5 * lambda_1 * norm(W, 'fro')^2;
    f_value = reg_term + data_term;
end
function [grad] = calculate_all_gradient(X, Y, W, lambda_1)
% Full (batch) subgradient of the regularized multi-label hinge objective
% at W, averaged over labels and instances.
%
% Inputs:
%   X, Y     - data and {-1,+1} label matrices
%   W        - [num_feature x num_label] current weights
%   lambda_1 - L2 regularization coefficient
% Output:
%   grad     - [num_feature x num_label] subgradient
    [num_instance, num_class] = size(Y);
    % Removed dead locals (Z_m, pre-assignment of grad_point) that were
    % immediately overwritten in the original.
    grad = lambda_1 * W;
    I = ones(num_instance, num_class);
    Z = zeros(num_instance, num_class);
    % sign(max(0, 1 - y.*score)) is the active-margin indicator; each
    % violated (instance, label) pair contributes -y * x' to the gradient.
    grad_point = X' * (-Y .* sign(max(Z, I - Y .* (X * W))));
    grad_point = grad_point / num_class;
    grad = grad + grad_point / num_instance;
end
function [grad_one] = calculate_one_gradient(x, y, W, lambda_1)
% Stochastic subgradient of the regularized hinge objective for a single
% instance, averaged over labels.
%
% Inputs:
%   x        - [1 x num_feature] one instance
%   y        - [1 x num_class] its {-1,+1} labels
%   W        - [num_feature x num_class] current weights
%   lambda_1 - L2 regularization coefficient
% Output:
%   grad_one - [num_feature x num_class] subgradient
    num_class = size(W, 2);
    % Removed dead locals (Z_m, pre-assignment of grad_point) that were
    % immediately overwritten in the original.
    grad_one = lambda_1 * W;
    I = ones(1, num_class);
    Z = zeros(1, num_class);
    % Indicator of violated margins times -y gives the hinge subgradient
    % per label; outer product with x' lifts it to weight space.
    grad_point = x' * (-y .* sign(max(Z, I - y .* (x * W))));
    grad_point = grad_point / num_class;
    grad_one = grad_one + grad_point;
end