-
Notifications
You must be signed in to change notification settings - Fork 6
/
bprop.m
100 lines (86 loc) · 3.81 KB
/
bprop.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
function [CE, gradient_weights, gradient_bias] = bprop(input_batch, ...
  target_batch, states, weights)
% BPROP  Cross-entropy loss and its gradients for a word-prediction network.
%
%   [CE, gradient_weights, gradient_bias] = bprop(input_batch, ...
%       target_batch, states, weights)
%
% The network has a word (input) layer, a linear embedding layer, one or
% more logistic hidden layers and an output layer.
%
% Inputs:
%   input_batch  - numwords X mbsz matrix of word indices; row w holds the
%                  w-th context word of every case.
%   target_batch - word index of the target for each of the mbsz cases.
%   states       - cell array of layer activations, ordered bottom-up:
%                  states{1} is the embedding layer (numwords*numemb X mbsz),
%                  states{2:end-1} are the hidden layers and states{end} is
%                  the output layer (V X mbsz).
%   weights      - cell array with numel(states)-1 matrices. weights{k}
%                  connects states{k} to states{k+1} and has size
%                  size(states{k},1) X size(states{k+1},1). The word
%                  embedding matrix itself is NOT part of this list.
%
% Outputs:
%   CE               - cross-entropy of the targets, averaged over the
%                      mini-batch.
%   gradient_weights - numel(states) X 1 cell. gradient_weights{1} is the
%                      V X numemb gradient of the word embedding weights;
%                      gradient_weights{k+1} is the gradient of weights{k},
%                      so gradient_weights{end} is the hidden-to-output
%                      weights gradient.
%   gradient_bias    - numel(weights) X 1 cell. gradient_bias{k} is the
%                      gradient of the bias of layer states{k+1}, so
%                      gradient_bias{end} is the output bias gradient.
%
% All gradients are summed (not averaged) over the mini-batch.
%
% NOTE(review): the output derivative used is y - t, i.e. the
% softmax/cross-entropy form; this assumes states{end} holds softmax
% probabilities - confirm against the forward pass.

% get dimensions
ny = numel(states);
nw = numel(weights);

% Fail early with a clear message instead of a cryptic indexing or
% dimension error deep inside the backward pass.
if ny < 3
  error('bprop:tooFewLayers', ...
    'states must hold embedding, hidden and output layers (got %d).', ny);
end
if nw ~= ny - 1
  error('bprop:weightCountMismatch', ...
    'expected %d weight matrices for %d layer states, got %d.', ...
    ny - 1, ny, nw);
end

[V, mbsz] = size(states{ny});
numwords = size(input_batch, 1);
numemb = size(states{1}, 1) / numwords;
if numwords < 1 || numemb ~= fix(numemb)
  error('bprop:embeddingSizeMismatch', ...
    'size(states{1},1) must be a positive multiple of size(input_batch,1).');
end
tiny = exp(-30);  % keeps log() finite when a probability is exactly 0

% Expand the target to a sparse 1-of-K vector.
expanded_target_batch = sparse(double(target_batch), 1:mbsz, 1, V, mbsz);

% MEASURE LOSS FUNCTION.
% C = -sum_output [t_output*log(y_output)]
CE = full(-sum(sum(expanded_target_batch .* log(states{ny} + tiny)))) / mbsz;

gradient_weights = cell(ny, 1);
gradient_bias = cell(nw, 1);

% BACK PROPAGATE.
% delta always holds dC/dz of the layer directly above weights{k}.
% dC/dz_output = y_output - t_output
delta = states{ny} - expanded_target_batch;
for k = nw:-1:1
  % dC/dw_{lower-upper} = y_lower * dC/dz_upper
  gradient_weights{k+1} = states{k} * delta';
  % dC/db_upper = dC/dz_upper
  gradient_bias{k} = sum(delta, 2);
  % dC/dy_lower = w_{lower-upper} * dC/dz_upper
  delta = weights{k} * delta;
  if k > 1
    % Hidden layers are logistic: dy/dz = y .* (1 - y).
    delta = delta .* (states{k} .* (1 - states{k}));
  end
  % k == 1: the embedding layer is linear (dy/dz = 1), so delta is
  % already dC/dz_emb.
end

% EMBEDDING TO WORD.
% Every context word shares the same embedding matrix, so sum the
% contribution of each word position's slice of dC/dz_emb.
gradient_weights{1} = zeros(V, numemb);
for w = 1:numwords
  % 1-of-V encoding of the w-th context word of every case
  expanded_input_batch = sparse(double(input_batch(w, :)), 1:mbsz, 1, ...
    V, mbsz);
  rows = (w-1)*numemb + 1 : w*numemb;
  % add dC/dw_{word-emb} = y_word * dC/dz_emb word by word
  gradient_weights{1} = gradient_weights{1} + ...
    expanded_input_batch * delta(rows, :)';
end