-
Notifications
You must be signed in to change notification settings - Fork 8
/
model.py
239 lines (204 loc) · 10.2 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import tensorflow as tf
# from helper_functions import *
from resnet_model import ResNet, get_block_sizes
from tensorflow.contrib.slim.nets import resnet_v2
import numpy as np
def frontend_3D(x_input, training=True):
    """Spatiotemporal 3D-conv front end: conv3d -> batch norm -> ReLU -> max-pool.

    :param x_input: 5-D tensor (batch, frames, height, width, channels);
        channels must be 1 to match the conv kernel's input-filter dim below.
    :param training: whether batch normalization runs in training mode.
    :return: pooled activation tensor, spatially downsampled 4x overall
        (stride 2 in the conv, stride 2 in the pool; time axis unchanged).
    """
    BATCH_SIZE, NUM_FRAMES, HEIGHT, WIDTH, NUM_CHANNELS = x_input.get_shape().as_list()
    # 3D CONVOLUTION
    # First Convolution Layer - use 64 3D filters of size 5x7x7 (time x h x w)
    # shape of weights is (dt, dh, dw, #in_filters, #out_filters)
    n = np.prod([5, 7, 7, 64])  # fan used for He-style weight initialization
    W_conv1 = tf.get_variable("W_conv1", initializer=tf.truncated_normal(shape=[5, 7, 7, 1, 64], stddev=np.sqrt(2/n)))
    b_conv1 = tf.get_variable("b_conv1", initializer=tf.constant(0.1, shape=[64]))
    # apply first convolution (stride 2 in the spatial dims only)
    z_conv1 = tf.nn.conv3d(x_input, W_conv1, strides=[1, 1, 2, 2, 1], padding='SAME') + b_conv1
    # apply batch normalization
    z_conv1_bn = tf.contrib.layers.batch_norm(z_conv1,
                                              data_format='NHWC',  # Matching the "cnn" tensor shape
                                              center=True,
                                              scale=True,
                                              is_training=training,
                                              scope='cnn3d-batch_norm')
    # apply relu activation
    h_conv1 = tf.nn.relu(z_conv1_bn)
    # BUG FIX: get_shape is a method and must be called; the original printed
    # the bound-method object instead of the shape.
    print("shape after 1st convolution is %s" % h_conv1.get_shape())
    # apply max pooling (3x3x3 window, stride 2 in the spatial dims)
    h_pool1 = tf.nn.max_pool3d(h_conv1,
                               strides=[1, 1, 2, 2, 1],
                               ksize=[1, 3, 3, 3, 1],
                               padding='SAME')
    print("shape after 1st pooling is %s" % h_pool1.get_shape())
    return h_pool1
def backend_resnet34(x_input, training=True):
    """Extract per-frame features with a 34-layer ResNet.

    Folds the time axis into the batch axis so the 2-D ResNet processes each
    frame independently, then restores the (batch, frames, features) layout.

    :param x_input: 5-D tensor (batch, frames, height, width, channels).
    :param training: passed to the ResNet (controls batch norm etc.).
        Defaults to True, matching the previously hard-coded behavior.
    :return: 3-D tensor (batch, frames, 512) of per-frame features.
    """
    BATCH_SIZE, NUM_FRAMES, HEIGHT, WIDTH, NUM_CHANNELS = x_input.get_shape().as_list()
    # RESNET: merge batch and time so each frame becomes one sample
    video_input = tf.reshape(x_input, (BATCH_SIZE * NUM_FRAMES, HEIGHT, WIDTH, NUM_CHANNELS))
    resnet_size = 34
    resnet = ResNet(resnet_size=resnet_size, bottleneck=False, num_classes=512, num_filters=64,
                    kernel_size=7, conv_stride=2, first_pool_size=3, first_pool_stride=2,
                    second_pool_size=7, second_pool_stride=1, block_sizes=get_block_sizes(resnet_size),
                    block_strides=[1, 2, 2, 2], final_size=512)
    # call the model object directly instead of invoking __call__ explicitly
    features = resnet(video_input, training)
    # unfold the time axis again: (batch, frames, feature_dim)
    features = tf.reshape(features, (BATCH_SIZE, NUM_FRAMES, int(features.get_shape()[1])))
    print("shape after resnet is %s" % features.get_shape())
    return features
def backend_resnet50_v2_slim(x_input):
    """Extract per-frame features with slim's ResNet-50 v2.

    :param x_input: 5-D tensor (batch, frames, height, width, channels).
    :return: 3-D tensor (batch, frames, 512) of per-frame features.
    """
    batch_size, num_frames, height, width, num_channels = x_input.get_shape().as_list()
    # RESNET: merge batch and time so the 2-D network sees one frame per sample
    frames = tf.reshape(x_input, (batch_size * num_frames, height, width, num_channels))
    features, end_points = resnet_v2.resnet_v2_50(frames, num_classes=512)
    # restore the time axis; the feature dim comes from the resnet output
    feature_dim = int(features.get_shape()[3])
    features = tf.reshape(features, (batch_size, num_frames, feature_dim))
    print("shape after resnet is %s" % features.get_shape())
    return features
def concat_resnet_output(resnet_out):
    """Average the per-frame resnet outputs over the time axis.

    The pooled vector summarizes the resnet's per-frame predictions of the
    word spoken.

    :param resnet_out: 3-D tensor (batch, frames, features).
    :return: 2-D tensor (batch, features).
    """
    pooled = tf.reduce_mean(resnet_out, axis=1)
    return pooled
def serialize_resnet_output(resnet_out):
    """Flatten the per-frame outputs into one long vector per batch element.

    :param resnet_out: 3-D tensor (batch, frames, classes).
    :return: 2-D tensor (batch, frames * classes).
    """
    BATCH_SIZE, NUM_FRAMES, NUM_CLASSES = resnet_out.get_shape()
    # BUG FIX: use -1 for the flattened axis. Multiplying Dimension objects
    # (NUM_FRAMES * NUM_CLASSES) breaks when either static dim is unknown;
    # -1 lets TF infer the size and is equivalent when both are known.
    return tf.reshape(resnet_out, (BATCH_SIZE, -1))
def fully_connected_logits(x_input, out_size):
    """Final fully connected layer producing class logits (e.g. 512 -> 500).

    :param x_input: 2-D feature tensor (batch, features).
    :param out_size: number of output classes.
    :return: unnormalized logits tensor (batch, out_size).
    """
    # predictions = tf.layers.dropout(inputs=x_input, rate=0.5)
    # BUG FIX: no activation on the logits layer. The previous ReLU zeroed out
    # every negative logit, which distorts the softmax cross-entropy that the
    # loss function computes from these values.
    predictions = tf.layers.dense(inputs=x_input, units=out_size, activation=None)
    # No Softmax here as this will be accounted for by the loss function
    # predictions = tf.nn.softmax(predictions)
    print("shape of predictions is %s" % predictions.get_shape())
    return predictions
def blstm_2layer(x_input):
    """2-layer bidirectional LSTM over a feature sequence.

    Runs a forward pass over the sequence and a second pass over the
    time-reversed sequence, then concatenates the final output of each
    direction.

    :param x_input: 3-D tensor (batch, time, features) with a static time dim.
    :return: 2-D tensor (batch, 512): last forward output ++ last backward output.
    """
    # Forward input as-is; backward input is the time-reversed sequence.
    dense_out_forw = x_input
    dense_out_back = tf.reverse(dense_out_forw, axis=[1])
    # BUG FIX: the forward and backward MultiRNNCells previously wrapped the
    # SAME list of cell objects, tying (or colliding on) their variables.
    # Build an independent 2-layer cell stack per direction, and give each
    # dynamic_rnn its own variable scope so the two passes cannot clash.
    rnn_layers_forw = [tf.contrib.rnn.LayerNormBasicLSTMCell(size) for size in [256, 256]]
    rnn_layers_back = [tf.contrib.rnn.LayerNormBasicLSTMCell(size) for size in [256, 256]]
    multi_rnn_cell_forw = tf.nn.rnn_cell.MultiRNNCell(rnn_layers_forw)
    multi_rnn_cell_back = tf.nn.rnn_cell.MultiRNNCell(rnn_layers_back)
    # 'outputs' is a tensor of shape [batch_size, max_time, 256]
    # 'state' is a N-tuple where N is the number of LSTMCells containing a
    # tf.contrib.rnn.LSTMStateTuple for each cell
    outputs_forw, _ = tf.nn.dynamic_rnn(cell=multi_rnn_cell_forw,
                                        inputs=dense_out_forw,
                                        dtype=tf.float32,
                                        scope='blstm_forw')
    outputs_back, _ = tf.nn.dynamic_rnn(cell=multi_rnn_cell_back,
                                        inputs=dense_out_back,
                                        dtype=tf.float32,
                                        scope='blstm_back')
    # keep only the last time step from each direction
    last_step = int(outputs_forw.get_shape()[1]) - 1
    last_forw = tf.gather(outputs_forw, indices=last_step, axis=1)
    last_back = tf.gather(outputs_back, indices=last_step, axis=1)
    bilstm_out = tf.concat([last_forw, last_back], axis=1)
    print("shape of bilstm output is %s" % bilstm_out.get_shape())
    return bilstm_out
def model_step1(x_input):
    """
    Includes:
    - spatiotemporal CNN front end
    - 34 layer resnet
    - fully connected layer (without softmax)
    :param x_input: input tensor (batch, frames, height, width, channels)
    :return: logits tensor over the 500 word classes
    """
    frontend_out = frontend_3D(x_input, training=False)
    resnet_features = backend_resnet34(frontend_out)
    pooled = concat_resnet_output(resnet_features)
    logits = fully_connected_logits(pooled, 500)
    return logits
# def get_model(x, training=True):
#
#
# # 3D CONVOLUTION
# # First Convolution Layer - Image input, use 64 3D filters of size 5x5x5
# # shape of weights is (dx, dy, dz, #in_filters, #out_filters)
# W_conv1 = tf.get_variable("W_conv1", initializer=tf.truncated_normal(shape=[5, 7, 7, 1, 64], stddev=0.1))
# b_conv1 = tf.get_variable("b_conv1", initializer=tf.constant(0.1, shape=[64]))
# # apply first convolution
# z_conv1 = tf.nn.conv3d(x, W_conv1, strides=[1, 1, 2, 2, 1], padding='SAME') + b_conv1
# # apply batch normalization
# z_conv1_bn = tf.contrib.layers.batch_norm(z_conv1,
# data_format='NHWC', # Matching the "cnn" tensor shape
# center=True,
# scale=True,
# is_training=training,
# scope='cnn3d-batch_norm')
# # apply relu activation
# h_conv1 = tf.nn.relu(z_conv1_bn)
# print("shape after 1st convolution is %s" % h_conv1.get_shape)
#
# # apply max pooling
# h_pool1 = tf.nn.max_pool3d(h_conv1,
# strides=[1, 1, 2, 2, 1],
# ksize=[1, 3, 3, 3, 1],
# padding='SAME')
# print(h_pool1.get_shape)
# print("shape after 1st pooling is %s" % h_pool1.get_shape)
#
#
# # Split the 3d convolution out to feed it to resnet
# resnet_input_series = tf.unstack(h_pool1, axis=1)
# print("number of elements in resnet_input_series is %s" % len(resnet_input_series))
# print("shape of resnet_input_series element is %s" % resnet_input_series[0].get_shape())
#
#
# # RESNET
# # need to change that to 34 layer model
# num_frames = len(resnet_input_series)
# scopes = ["res_frame%d"%i for i in range(num_frames)]
# # res_out = []
# dense_out = [] # performs dimensionality reduction to the output of resnet (512 -> 256)
# for i, resnet_input in enumerate(resnet_input_series[:2]): # REMOVE [:2]
# with tf.variable_scope(scopes[i]):
# features, end_points = resnet_v2.resnet_v2_50(resnet_input, num_classes=512)
# # res_out.append(features)
# dense_out.append(tf.layers.dense(inputs=features, units=256, activation=tf.nn.relu))
#
# print("shape after resnet is %s" % features.get_shape())
#
# # add resnet outputs per frame. This vector estimates resnets prediction on the word spoken
# # result = tf.add_n(res_out)
# # predictions = tf.nn.softmax(result)
# # print("shape of predictions is %s" % predictions.get_shape())
#
# # 2-layer BiLSTM
# dense_out = tf.concat(dense_out, axis=1)
# # Define input for forward and backward LSTM
# dense_out_forw = tf.squeeze(dense_out, axis=2)
# dense_out_back = tf.reverse(dense_out_forw, axis=[1])
# # create 2 layer LSTMCells
# rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [256, 256]]
#
# # create a RNN cell composed sequentially of a number of RNNCells
# multi_rnn_cell_forw = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
# multi_rnn_cell_back = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
#
# # 'outputs' is a tensor of shape [batch_size, max_time, 256]
# # 'state' is a N-tuple where N is the number of LSTMCells containing a
# # tf.contrib.rnn.LSTMStateTuple for each cell
# outputs_forw, _ = tf.nn.dynamic_rnn(cell=multi_rnn_cell_forw,
# inputs=dense_out_forw,
# dtype=tf.float32)
# outputs_back, _ = tf.nn.dynamic_rnn(cell=multi_rnn_cell_back,
# inputs=dense_out_back,
# dtype=tf.float32)
#
# # get only the last output from lstm
# # lstm_out = tf.transpose(lstm_out, [1, 0, 2])
# last_forw = tf.gather(outputs_forw, indices=int(outputs_forw.get_shape()[1])-1, axis=1)
# last_back = tf.gather(outputs_back, indices=int(outputs_forw.get_shape()[1])-1, axis=1)
#
# bilstm_out = tf.concat([last_forw, last_back], axis=1)
# print("shape of bilstm output is %s" % bilstm_out.get_shape())
#
# linear_out = tf.layers.dense(inputs=bilstm_out, units=500, activation=tf.nn.relu)
#
# pred = tf.nn.softmax(linear_out)
#
# return pred