# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Python wrapper for the Block GRU Op."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.contrib.rnn.ops import gen_gru_ops
from tensorflow.contrib.util import loader
from tensorflow.python.framework import ops
from tensorflow.python.layers import base as base_layer
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.platform import resource_loader
from tensorflow.python.util.deprecation import deprecated_args

_gru_ops_so = loader.load_op_library(
resource_loader.get_path_to_datafile("_gru_ops.so"))

LayerRNNCell = rnn_cell_impl.LayerRNNCell  # pylint: disable=invalid-name


@ops.RegisterGradient("GRUBlockCell")
def _GRUBlockCellGrad(op, *grad):
r"""Gradient for GRUBlockCell.
Args:
op: Op for which the gradient is defined.
*grad: Gradients of the optimization function wrt output
for the Op.
Returns:
d_x: Gradients wrt to x
d_h: Gradients wrt to h
d_w_ru: Gradients wrt to w_ru
d_w_c: Gradients wrt to w_c
d_b_ru: Gradients wrt to b_ru
d_b_c: Gradients wrt to b_c
Mathematics behind the Gradients below:
```
d_c_bar = d_h \circ (1-u) \circ (1-c \circ c)
d_u_bar = d_h \circ (h-c) \circ u \circ (1-u)
d_r_bar_u_bar = [d_r_bar d_u_bar]
[d_x_component_1 d_h_prev_component_1] = d_r_bar_u_bar * w_ru^T
[d_x_component_2 d_h_prevr] = d_c_bar * w_c^T
d_x = d_x_component_1 + d_x_component_2
d_h_prev = d_h_prev_component_1 + d_h_prevr \circ r + u
```
Below calculation is performed in the python wrapper for the Gradients
(not in the gradient kernel.)
```
d_w_ru = x_h_prevr^T * d_c_bar
d_w_c = x_h_prev^T * d_r_bar_u_bar
d_b_ru = sum of d_r_bar_u_bar along axis = 0
d_b_c = sum of d_c_bar along axis = 0
```
"""
x, h_prev, w_ru, w_c, b_ru, b_c = op.inputs
r, u, c, _ = op.outputs
_, _, _, d_h = grad
d_x, d_h_prev, d_c_bar, d_r_bar_u_bar = gen_gru_ops.gru_block_cell_grad(
x, h_prev, w_ru, w_c, b_ru, b_c, r, u, c, d_h)
x_h_prev = array_ops.concat([x, h_prev], 1)
d_w_ru = math_ops.matmul(x_h_prev, d_r_bar_u_bar, transpose_a=True)
d_b_ru = nn_ops.bias_add_grad(d_r_bar_u_bar)
x_h_prevr = array_ops.concat([x, h_prev * r], 1)
d_w_c = math_ops.matmul(x_h_prevr, d_c_bar, transpose_a=True)
d_b_c = nn_ops.bias_add_grad(d_c_bar)
return d_x, d_h_prev, d_w_ru, d_w_c, d_b_ru, d_b_c
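

# The following is an illustrative NumPy sketch of the gradient math in the
# docstring above; it is not part of the library and exists only so the
# equations can be checked against the kernel. All names mirror the docstring.
def gru_block_cell_grad_reference(x, h_prev, w_ru, w_c, r, u, c, d_h):
  """Reference backward pass for one GRU step, following the docstring math."""
  import numpy as np  # Local import: this sketch is not part of the library.

  input_size = x.shape[1]
  d_c_bar = d_h * (1 - u) * (1 - c * c)
  d_u_bar = d_h * (h_prev - c) * u * (1 - u)
  # Backprop through c_bar = [x, h_prev * r] * w_c + b_c.
  d_x_h_prevr = d_c_bar.dot(w_c.T)
  d_x_component_2 = d_x_h_prevr[:, :input_size]
  d_h_prevr = d_x_h_prevr[:, input_size:]
  # Backprop through the reset gate r = sigmoid(r_bar).
  d_r_bar = (d_h_prevr * h_prev) * r * (1 - r)
  d_r_bar_u_bar = np.concatenate([d_r_bar, d_u_bar], axis=1)
  # Backprop through [r_bar u_bar] = [x, h_prev] * w_ru + b_ru.
  d_x_h_prev = d_r_bar_u_bar.dot(w_ru.T)
  d_x = d_x_h_prev[:, :input_size] + d_x_component_2
  d_h_prev = d_x_h_prev[:, input_size:] + d_h_prevr * r + d_h * u
  # Weight and bias gradients, as computed in the Python wrapper above.
  d_w_ru = np.concatenate([x, h_prev], axis=1).T.dot(d_r_bar_u_bar)
  d_w_c = np.concatenate([x, h_prev * r], axis=1).T.dot(d_c_bar)
  d_b_ru = d_r_bar_u_bar.sum(axis=0)
  d_b_c = d_c_bar.sum(axis=0)
  return d_x, d_h_prev, d_w_ru, d_w_c, d_b_ru, d_b_c

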
class GRUBlockCell(LayerRNNCell):
r"""Block GRU cell implementation.
Deprecated: use GRUBlockCellV2 instead.
The implementation is based on: http://arxiv.org/abs/1406.1078
Computes the GRU cell forward propagation for 1 time step.
This kernel op implements the following mathematical equations:
Biases are initialized with:
* `b_ru` - constant_initializer(1.0)
* `b_c` - constant_initializer(0.0)
```
x_h_prev = [x, h_prev]
[r_bar u_bar] = x_h_prev * w_ru + b_ru
r = sigmoid(r_bar)
u = sigmoid(u_bar)
h_prevr = h_prev \circ r
x_h_prevr = [x h_prevr]
c_bar = x_h_prevr * w_c + b_c
c = tanh(c_bar)
h = (1-u) \circ c + u \circ h_prev
```
"""

  @deprecated_args(None, "cell_size is deprecated, use num_units instead",
                   "cell_size")
def __init__(self,
num_units=None,
cell_size=None,
reuse=None,
name="gru_cell"):
"""Initialize the Block GRU cell.
Args:
num_units: int, The number of units in the GRU cell.
cell_size: int, The old (deprecated) name for `num_units`.
reuse: (optional) boolean describing whether to reuse variables in an
existing scope. If not `True`, and the existing scope already has the
given variables, an error is raised.
name: String, the name of the layer. Layers with the same name will
share weights, but to avoid mistakes we require reuse=True in such
cases. By default this is "lstm_cell", for variable-name compatibility
with `tf.nn.rnn_cell.GRUCell`.
Raises:
ValueError: if both cell_size and num_units are not None;
or both are None.
"""
super(GRUBlockCell, self).__init__(_reuse=reuse, name=name)
if (cell_size is None) == (num_units is None):
raise ValueError(
"Exactly one of num_units or cell_size must be provided.")
if num_units is None:
num_units = cell_size
self._cell_size = num_units
# Inputs must be 2-dimensional.
self.input_spec = base_layer.InputSpec(ndim=2)

  @property
  def state_size(self):
    return self._cell_size

  @property
  def output_size(self):
    return self._cell_size

  def build(self, input_shape):
    # Check that the input size is known.
    input_size = input_shape[1].value
    if input_size is None:
      raise ValueError("Expecting input_size to be set.")
self._gate_kernel = self.add_variable(
"w_ru", [input_size + self._cell_size, self._cell_size * 2])
self._gate_bias = self.add_variable(
"b_ru", [self._cell_size * 2],
initializer=init_ops.constant_initializer(1.0))
self._candidate_kernel = self.add_variable(
"w_c", [input_size + self._cell_size, self._cell_size])
self._candidate_bias = self.add_variable(
"b_c", [self._cell_size],
initializer=init_ops.constant_initializer(0.0))
self.built = True

  def call(self, inputs, h_prev):
    """Run one step of the GRU cell."""
# Check cell_size == state_size from h_prev.
cell_size = h_prev.get_shape().with_rank(2)[1]
if cell_size != self._cell_size:
raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" %
(self._cell_size, cell_size))
_gru_block_cell = gen_gru_ops.gru_block_cell # pylint: disable=invalid-name
_, _, _, new_h = _gru_block_cell(
x=inputs,
h_prev=h_prev,
w_ru=self._gate_kernel,
w_c=self._candidate_kernel,
b_ru=self._gate_bias,
b_c=self._candidate_bias)
return new_h, new_h
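

# A matching NumPy sketch of the forward equations from GRUBlockCell's
# docstring, again illustrative only and not part of the library. Combined
# with gru_block_cell_grad_reference above, it lets the docstring math be
# checked end to end (e.g. against finite differences).
def gru_block_cell_reference(x, h_prev, w_ru, w_c, b_ru, b_c):
  """Reference forward pass for one GRU step, following the docstring math."""
  import numpy as np  # Local import: this sketch is not part of the library.

  def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

  x_h_prev = np.concatenate([x, h_prev], axis=1)
  # The gate kernel produces both gates at once; split them after sigmoid.
  r, u = np.split(sigmoid(x_h_prev.dot(w_ru) + b_ru), 2, axis=1)
  x_h_prevr = np.concatenate([x, h_prev * r], axis=1)
  c = np.tanh(x_h_prevr.dot(w_c) + b_c)
  h = (1 - u) * c + u * h_prev
  return r, u, c, h

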
class GRUBlockCellV2(GRUBlockCell):
"""Temporary GRUBlockCell impl with a different variable naming scheme.
Only differs from GRUBlockCell by variable names.
"""

  def build(self, input_shape):
    """Create the GRU cell variables, using the V2 variable names."""
    input_size = input_shape[1].value
    if input_size is None:
      raise ValueError("Expecting input_size to be set.")
self._gate_kernel = self.add_variable(
"gates/kernel", [input_size + self._cell_size, self._cell_size * 2])
self._gate_bias = self.add_variable(
"gates/bias", [self._cell_size * 2],
initializer=init_ops.constant_initializer(1.0))
self._candidate_kernel = self.add_variable(
"candidate/kernel", [input_size + self._cell_size, self._cell_size])
self._candidate_bias = self.add_variable(
"candidate/bias", [self._cell_size],
initializer=init_ops.constant_initializer(0.0))
self.built = True
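

# Illustrative usage sketch, guarded so it only runs when this file is
# executed directly. The shapes and the use of tf.nn.dynamic_rnn are
# assumptions about a typical call site, not something this module mandates.
if __name__ == "__main__":
  import tensorflow as tf

  batch, time_steps, input_size, cell_size = 2, 5, 3, 4
  inputs = tf.random_normal([batch, time_steps, input_size])
  cell = GRUBlockCellV2(num_units=cell_size)
  # dynamic_rnn unrolls the single-step kernel over the time dimension.
  outputs, final_h = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out_np, h_np = sess.run([outputs, final_h])
    print("outputs:", out_np.shape)     # (2, 5, 4)
    print("final state:", h_np.shape)   # (2, 4)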