# train.py
#----------IMPORT PYTHON PACKAGES----------#
from tensorflow.keras.mixed_precision import experimental as mixed_precision
import horovod.tensorflow.keras as hvd
from tensorflow import keras
import tensorflow as tf
import cityscapesscripts.helpers.labels as labels
import numpy as np
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from datetime import datetime
import argparse
import sys
import os
#----------IMPORT MODELS AND UTILITIES----------#
import unet
import separable_unet
import bayes_segnet
import deeplabv3plus
import fast_scnn
import lrfinder
import datasets
#----------CONSTANTS----------#
CITYSCAPES_LABELS = [
label for label in labels.labels if -1 < label.trainId < 255]
# Add unlabeled
CITYSCAPES_LABELS.append(labels.labels[0])
# Get all the colors
CITYSCAPES_COLORS = [label.color for label in CITYSCAPES_LABELS]
# Get the IDS
CITYSCAPES_IDS = [label.trainId for label in CITYSCAPES_LABELS]
# Change unlabeled id from 255 to 19
CITYSCAPES_IDS[-1] = 19
CLASSES = 20
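# For reference: CITYSCAPES_IDS should now be the 19 Cityscapes train IDs 0..18
# plus 19 for the appended "unlabeled" entry, with one RGB tuple per entry in
# CITYSCAPES_COLORS, e.g. (128, 64, 128) for "road" (trainId 0).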
#----------ARGUMENTS----------#
parser = argparse.ArgumentParser(
prog='train.py', description="Start training a semantic segmentation model")
parser.add_argument(
"-m", "--model",
help="Specify the model you wish to use: OPTIONS: unet, separable_unet, bayes_segnet, deeplabv3+, fastscnn",
choices=['unet', 'bayes_segnet', 'deeplabv3+',
'fastscnn', 'separable_unet'],
required=True)
parser.add_argument(
"-r", "--resume",
help="Resume the training, specify the weights file path of the format weights-[epoch]-[val-acc]-[val-loss]-[datetime].hdf5")
parser.add_argument(
'-p', "--path",
help="Specify the root folder for cityscapes dataset, if not used looks for CITYSCAPES_DATASET environment variable")
parser.add_argument(
'-c', "--coarse",
help="Use the coarse images", action="store_true")
parser.add_argument(
'-t', "--target-size",
help="Set the image size for training, should be a elements of a tuple x,y,c")
parser.add_argument(
'-b', "--batch",
help="Set the batch size", default=8, type=int)
parser.add_argument(
'-e', "--epochs",
help="Set the number of Epochs", default=1000, type=int)
parser.add_argument(
'--mixed-precision',
help="Use Mixed Precision. WARNING: May cause memory leaks", action="store_true")
parser.add_argument(
'-f', '--lrfinder',
    help='Use the Learning Rate Finder on a model to determine a good learning rate range for the chosen optimizer',
choices=['adadelta', 'adagrad', 'adam', 'adamax', 'ftrl', 'nadam', 'rmsprop', 'sgd', 'sgd_nesterov'])
parser.add_argument(
'--schedule',
help="Set a Learning Rate Schedule, here either Polynomial Decay (polynomial) or Cyclic Learning Rate (cyclic) is Available",
choices=['polynomial', 'cyclic'])
parser.add_argument(
'--momentum',
    help='Only used with --lrfinder; sets the optimizer momentum if the optimizer supports it',
type=float,
default=0.0)
parser.add_argument(
'-l', '--learning-rate',
    help='Set the learning rate', type=float)
schedule_arg = parser.add_argument_group(
title='schedule', description='Arguments for the Learning Rate Scheduler')
schedule_arg.add_argument(
'--max-lr', help='The maximum learning rate during training', type=float)
schedule_arg.add_argument(
'--min-lr', help='The minimum learning rate during training', type=float)
schedule_arg.add_argument(
'--power', help='The power used in Polynomial Decay', type=float)
schedule_arg.add_argument(
'--cycle', help='The length of cycle used for Cyclic Learning Rates', type=int)
args = parser.parse_args()
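# Example invocation (hypothetical paths and values):
#   python train.py -m fastscnn -p /data/cityscapes -t 512,1024,3 -b 4 \
#       --schedule polynomial --min-lr 1e-4 --max-lr 0.045 --power 0.9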
# Get model_name
model_name = args.model
# Check the CITYSCAPES_ROOT path
if args.path is not None and os.path.isdir(args.path):
CITYSCAPES_ROOT = args.path
elif 'CITYSCAPES_DATASET' in os.environ:
CITYSCAPES_ROOT = os.environ.get('CITYSCAPES_DATASET')
else:
parser.error("ERROR: No valid path for Cityscapes Dataset given")
# Now do the target size
if args.target_size is None:
target_size = (1024, 2048, 3)
else:
target_size = tuple(int(i) for i in args.target_size.split(','))
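# e.g. "-t 512,1024,3" is parsed into the (height, width, channels) tuple (512, 1024, 3)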
# Get the Batch Size
batch_size = args.batch
# Get the number of epochs
epochs = args.epochs
# Check if the resume has been used, if so check the path
weights_path = args.resume
initial_epoch = 0
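# Example (hypothetical file): "weights-07-0.85-0.43-01_01_2020_12_00_00.hdf5"
# splits to initial_epoch=7 and time="01_01_2020_12_00_00".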
if args.resume is not None and os.path.isfile(args.resume):
filename_split = os.path.basename(args.resume).split('-')
    initial_epoch = int(filename_split[1])
time = filename_split[-1].replace('.hdf5', '')
elif args.resume is not None and not os.path.isfile(args.resume):
parser.error("ERROR: Weights File not found.")
if args.momentum < 0:
parser.error("ERROR: You specified a momentum value and it was below 0")
if args.schedule and (not args.max_lr or not args.min_lr):
parser.error(
"When using the scheduler, both the max and min learning rate must be set.")
if args.schedule == 'polynomial' and not args.power:
parser.error("The power must be specified for Polynomal Decay")
if args.schedule == 'cyclic' and (args.cycle is None or args.cycle <= 0):
    parser.error(
        "When using the Cyclic Learning Rate schedule, --cycle must be a positive integer.")
# Initialize horovod
hvd.init()
gpus = tf.config.list_physical_devices('GPU')
# Set Memory Growth to alleviate memory issues
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
tf.config.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
print("\nUsing {} GPUs via Horovod on rank {}\n".format(
tf.config.get_visible_devices('GPU'),
hvd.rank())
)
# Change policy to fp16 to use Tensor Cores
if args.mixed_precision:
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_policy(policy)
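# Note: with this experimental API, the 'mixed_float16' policy computes in
# float16 while keeping variables in float32, and it should enable dynamic
# loss scaling by default, so no explicit LossScaleOptimizer is created here.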
class csMeanIoU(keras.metrics.Metric):
"""
Very Similar to the keras Mean IoU
with some minor changes specific
to this workflow.
"""
def __init__(self, num_classes, name=None, dtype=None):
super(csMeanIoU, self).__init__(name=name, dtype=dtype)
self.num_classes = num_classes
self.cm = self.add_weight(
'confusion_matrix',
shape=(num_classes - 1, num_classes - 1),
initializer=tf.zeros_initializer(),
dtype=tf.float64,
)
def update_state(self, y_true, y_pred, sample_weight=None):
# Change y_true and y_pred to labels and flatten them
y_true = tf.argmax(y_true, -1)
y_pred = tf.argmax(y_pred, -1)
y_true = tf.reshape(y_true, [-1])
y_pred = tf.reshape(y_pred, [-1])
# Get the Confusion Matrix
curr_cm = tf.math.confusion_matrix(
y_true, y_pred, self.num_classes, dtype=tf.float64)
# Slice the Confusion Matrix to Ignore the Background
curr_cm = tf.slice(
curr_cm, (0, 0), (self.num_classes - 1, self.num_classes - 1))
        # Accumulate this batch's confusion matrix into the running total
        return self.cm.assign_add(curr_cm)
def result(self):
# Get True Positives, False Positives, False Negatives
tp = tf.linalg.tensor_diag_part(self.cm)
fp = tf.math.reduce_sum(self.cm, axis=0) - tp
fn = tf.math.reduce_sum(self.cm, axis=1) - tp
# Calculate the denominator
denom = tp + fp + fn
# Get the IoU of each class
ious = tf.math.divide_no_nan(tp, denom)
# Get the valid entries (Not 0)
num_valid_entries = tf.math.reduce_sum(
tf.cast(tf.math.not_equal(denom, 0), dtype=tf.float64))
# Return the Mean IoU of this image and Label
return tf.math.divide_no_nan(tf.math.reduce_sum(ious), num_valid_entries)
def reset_states(self):
keras.backend.set_value(self.cm, np.zeros(
(self.num_classes-1, self.num_classes-1)))
def get_config(self):
config = {'num_classes': self.num_classes}
base_config = super(csMeanIoU, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
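# Minimal sanity check for the metric (a sketch, not executed during training):
#   m = csMeanIoU(num_classes=3)
#   m.update_state(tf.one_hot([[0, 1]], 3), tf.one_hot([[0, 1]], 3))
#   m.result()  # -> 1.0, since both non-background classes match exactly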
#----------CREATE DATASETS----------#
train_ds = datasets.create_dataset(
CITYSCAPES_ROOT,
batch_size, epochs,
target_size,
classes=CLASSES,
shard_size=hvd.size(),
shard_rank=hvd.rank()
)
val_ds = datasets.create_dataset(
CITYSCAPES_ROOT,
batch_size, epochs,
target_size,
train=False,
classes=CLASSES,
shard_size=hvd.size(),
shard_rank=hvd.rank()
)
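# Each worker reads its own 1/hvd.size() shard of the data (via shard_size and
# shard_rank), which is why the steps per epoch are divided by hvd.size() below.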
#----------CREATE MODEL AND BEGIN TRAINING----------#
time = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
# Depending on model chosen, get the right model and create the optimizer for it
if model_name == 'unet':
if args.learning_rate:
learning_rate = args.learning_rate*hvd.size()
else:
learning_rate = 0.01*hvd.size()
model = unet.model(input_size=target_size, num_classes=CLASSES)
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
elif model_name == 'separable_unet':
if args.learning_rate:
learning_rate = args.learning_rate*hvd.size()
else:
learning_rate = 0.003*hvd.size()
model = separable_unet.model(input_size=target_size, num_classes=CLASSES)
optimizer = keras.optimizers.SGD(
learning_rate=learning_rate, momentum=0.9, nesterov=True)
elif model_name == 'bayes_segnet':
if args.learning_rate:
learning_rate = args.learning_rate*hvd.size()
else:
learning_rate = 0.001*hvd.size()
model = bayes_segnet.model(num_classes=CLASSES, input_size=target_size)
optimizer = keras.optimizers.SGD(momentum=0.9, learning_rate=learning_rate)
elif model_name == 'fastscnn':
if args.learning_rate:
learning_rate = args.learning_rate*hvd.size()
else:
learning_rate = 0.045*hvd.size()
model = fast_scnn.model(input_size=target_size, num_classes=CLASSES)
optimizer = keras.optimizers.SGD(momentum=0.9, learning_rate=learning_rate)
elif model_name == 'deeplabv3+':
if args.learning_rate:
learning_rate = args.learning_rate*hvd.size()
else:
learning_rate = 0.007*hvd.size()
model = deeplabv3plus.model(
input_size=target_size, num_classes=CLASSES, depthwise=True, backbone='xception')
optimizer = keras.optimizers.SGD(
learning_rate=learning_rate)
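# All base learning rates above are scaled by hvd.size(), following the linear
# scaling rule (https://arxiv.org/abs/1706.02677) together with the warmup
# callback used below.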
# We are in LRFINDER mode
if args.lrfinder is not None:
# Set a new optimizer for the LRFINDER
if args.lrfinder == 'sgd_nesterov':
optimizer = keras.optimizers.SGD(learning_rate=1e-6, nesterov=True)
else:
optimizer = keras.optimizers.get(args.lrfinder)
optimizer.learning_rate = 1e-6
if 'momentum' in optimizer.get_config():
optimizer.momentum = args.momentum
# Create distributed optimizer
optimizer = hvd.DistributedOptimizer(optimizer)
# Compile the model
model.compile(
loss='categorical_crossentropy',
optimizer=optimizer,
metrics=[keras.metrics.CategoricalAccuracy(name='accuracy')],
experimental_run_tf_function=False,
)
if hvd.rank() == 0:
print("Finding Optimum Learning Rate Range")
# Set the config for the lrfinder
max_lr = 1e1
epochs = 5
num_steps = np.ceil(len(datasets.get_cityscapes_files(
CITYSCAPES_ROOT, 'leftImg8bit', 'train', 'leftImg8bit')) / batch_size)
num_steps = np.ceil(num_steps / hvd.size())
# Create the learning rate finder
lrf = lrfinder.LearningRateFinder(model)
callbacks = []
verbose = 0
# Show the progress bar from only one worker
if hvd.rank() == 0:
verbose = 1
# Use the callbacks for horovod necessary for metrics and variables
callbacks = [
hvd.callbacks.BroadcastGlobalVariablesCallback(0),
hvd.callbacks.MetricAverageCallback(),
]
# Start the Finder
lrf.find(train_ds, 1e-6, max_lr, batch_size,
num_steps, epochs, callbacks=callbacks, verbose=verbose)
# Since we are using horovod, we only use the first worker to produce the plot
if hvd.rank() == 0:
# Create title for image, which will also be the name of the image file
title = model_name + '-' + args.lrfinder
if 'momentum' in optimizer.get_config():
title = title + '-' + str(args.momentum)
print("The Learning Rate Range has been found")
# Create the path for the image file
path = os.path.join('.', 'logs', model_name,
'lrfinder_results', title + '.png')
if not os.path.isdir(os.path.dirname(path)):
os.makedirs(os.path.dirname(path))
print("Plot the Loss")
# Plot the loss
lrf.plot_loss(title=title)
# Save the figure
plt.savefig(path, dpi=300, format='png')
print("Please Look Here: {} for your results".format(path))
# Else we are training a model
else:
optimizer = hvd.DistributedOptimizer(optimizer)
# Whatever the model is, compile it
model.compile(
loss='categorical_crossentropy',
optimizer=optimizer,
metrics=[
keras.metrics.CategoricalAccuracy(name='accuracy'),
csMeanIoU(num_classes=CLASSES, name="meanIoU")
],
experimental_run_tf_function=False
)
# Ensure directories are made for callbacks
logs_dir = os.path.join('.', 'logs', model_name, time)
csv_dir = os.path.join(logs_dir, 'csv')
images_dir = os.path.join(logs_dir, 'images')
if hvd.rank() == 0:
if not os.path.isdir(logs_dir):
os.makedirs(logs_dir)
if not os.path.isdir(csv_dir):
os.mkdir(csv_dir)
if not os.path.isdir(images_dir):
os.mkdir(images_dir)
# Custom Callback to save a prediction every epoch
class prediction_on_epoch(keras.callbacks.Callback):
        def __init__(self, target_size):
            super(prediction_on_epoch, self).__init__()
            self.target_size = target_size
def on_epoch_end(self, epoch, logs=None):
image_to_predict = CITYSCAPES_ROOT + \
'/leftImg8bit/val/frankfurt/frankfurt_000000_003025_leftImg8bit.png'
# Get the byte strings for the files
image_to_predict = tf.io.read_file(image_to_predict)
# Decode the byte strings as png images
image_to_predict = tf.image.decode_png(
image_to_predict, channels=3)
# Convert the image to float32 and scale all values between 0 and 1
image_to_predict = tf.image.convert_image_dtype(
image_to_predict, tf.float32)
# Resize the image to target_size
image_to_predict = tf.image.resize(
image_to_predict, (self.target_size[0], self.target_size[1]), method='nearest')
# Reshape it ready for prediction as a single batch!
image_to_predict = tf.expand_dims(image_to_predict, 0)
# Do a prediction
prediction = model.predict(image_to_predict)
# Reshape prediction
prediction = tf.squeeze(prediction)
            # Weight each class color by its predicted probability:
            # (H, W, CLASSES) softmax output x (CLASSES, 3) RGB -> (H, W, 3)
            image_to_save = np.matmul(prediction, CITYSCAPES_COLORS)
            # Save the image
            mpimg.imsave(images_dir+'/epoch_{}.png'.format(epoch),
                         image_to_save.astype(np.uint8), dpi=300)
class polyDecay(keras.callbacks.Callback):
        def __init__(self, min_lr, max_lr, steps, power, rank, warmup_epochs=3):
            super(polyDecay, self).__init__()
self.max_lr = max_lr
self.min_lr = min_lr
self.steps = steps
self.power = power
self.rank = rank
self.warmup_epochs = warmup_epochs
def on_epoch_begin(self, epoch, logs=None):
# If the optimizer has no learning_rate attribute we can't use the callback
if not hasattr(self.model.optimizer, 'learning_rate'):
raise ValueError(
'Optimizer must have a "learning_rate" attribute.')
if epoch > self.warmup_epochs:
# Get the previous learning rate
previous = tf.identity(self.model.optimizer.learning_rate)
# Work out the decay
decay = (1 - epoch / self.steps) ** (self.power)
# Change the current learning rate according to the decay and minimum and maximum learning rates
self.model.optimizer.learning_rate = (
(self.max_lr - self.min_lr) * decay) + self.min_lr
                # If it's the first worker, print the new learning rate
if self.rank == 0:
print("Learning Rate has changed from {} to {}".format(
previous.numpy(), self.model.optimizer.learning_rate.numpy()))
class cyclicLR(keras.callbacks.Callback):
        def __init__(self, min_lr, max_lr, cycle, rank, warmup_epochs=3):
            super(cyclicLR, self).__init__()
self.min_lr = min_lr
self.max_lr = max_lr
self.cycle = cycle
            self.step_size = (max_lr - min_lr)/cycle
            # Round to roughly the step size's decimal precision to avoid float drift
            self.round_to = len(str(self.step_size))
self.direction = 1
self.rank = rank
self.warmup_epochs = warmup_epochs
def on_epoch_begin(self, epoch, logs=None):
# If the optimizer has no learning_rate attribute we can't use the callback
if not hasattr(self.model.optimizer, 'learning_rate'):
raise ValueError(
'Optimizer must have a "learning_rate" attribute.')
if epoch > self.warmup_epochs:
# Get the previous learning rate
previous = tf.identity(self.model.optimizer.learning_rate)
# Get the new learning rate based on the direction we're going and the step size
self.model.optimizer.learning_rate = np.round(
previous + self.step_size*self.direction, decimals=self.round_to)
                # Reverse direction once we hit the minimum or maximum learning rate
if self.model.optimizer.learning_rate == self.min_lr or self.model.optimizer.learning_rate == self.max_lr:
self.direction *= -1
                # If it's the first worker, print the new learning rate
if self.rank == 0:
print("Learning Rate has changed from {} to {}".format(
previous.numpy(), self.model.optimizer.learning_rate.numpy()))
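    # Worked example (hypothetical values): min_lr=1e-4, max_lr=1e-3, cycle=6
    # gives step_size=1.5e-4, so after warmup the LR climbs
    # 1e-4 -> 2.5e-4 -> ... -> 1e-3 over six epochs, then reverses back toward 1e-4.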
    # These callbacks log almost everything and save the best weights as the model trains; in theory the last saved weights are the best
callbacks = [
# Horovod: broadcast initial variable states from rank 0 to all other processes.
# This is necessary to ensure consistent initialization of all workers when
# training is started with random weights or restored from a checkpoint.
hvd.callbacks.BroadcastGlobalVariablesCallback(0),
# Horovod: average metrics among workers at the end of every epoch.
#
# Note: This callback must be in the list before the ReduceLROnPlateau,
# TensorBoard or other metrics-based callbacks.
hvd.callbacks.MetricAverageCallback(),
# Horovod: using `lr = x * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = x` ---> `lr = x * hvd.size()` during
# the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, verbose=1),
]
# If we are using a Learning Rate Schedule then use the appropriate callbacks
if args.schedule == 'polynomial':
model.optimizer.learning_rate = args.max_lr*hvd.size()
schedule = polyDecay(args.min_lr*hvd.size(), args.max_lr*hvd.size(),
epochs // hvd.size(), args.power, hvd.rank())
callbacks.append(schedule)
elif args.schedule == 'cyclic':
model.optimizer.learning_rate = args.min_lr*hvd.size()
schedule = cyclicLR(args.min_lr*hvd.size(),
args.max_lr*hvd.size(), args.cycle, hvd.rank())
callbacks.append(schedule)
    # If it's the chief worker (rank 0), add the logging callbacks
if hvd.rank() == 0:
callbacks = callbacks + [
keras.callbacks.CSVLogger(csv_dir+'/log.csv'),
keras.callbacks.TensorBoard(log_dir=logs_dir, histogram_freq=1),
keras.callbacks.ModelCheckpoint(
logs_dir+'/weights-{epoch:02d}-{val_accuracy:.2f}-{val_loss:.2f}-'+time+'.hdf5', 'val_loss', mode='min', save_best_only=True),
prediction_on_epoch(target_size)
]
# Set the number of steps per epoch
training_steps = np.ceil(len(datasets.get_cityscapes_files(
CITYSCAPES_ROOT, 'leftImg8bit', 'train', 'leftImg8bit')) / batch_size)
validation_steps = np.ceil(len(datasets.get_cityscapes_files(
CITYSCAPES_ROOT, 'leftImg8bit', 'val', 'leftImg8bit')) / batch_size)
verbose = 0
if hvd.rank() == 0:
verbose = 1
training_steps = np.ceil(training_steps / hvd.size())
validation_steps = np.ceil(validation_steps / hvd.size())
# If we need to resume the training
if args.resume is not None:
model.load_weights(args.resume)
# Train!
history = model.fit(
train_ds,
epochs=epochs // hvd.size(),
steps_per_epoch=training_steps,
validation_data=val_ds,
validation_steps=validation_steps,
callbacks=callbacks,
verbose=verbose,
initial_epoch=initial_epoch
)
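    # history.history holds the per-epoch 'loss', 'accuracy' and 'meanIoU' values
    # (plus their 'val_'-prefixed counterparts), mirroring what the CSVLogger writes.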