# videocls.py
# Example code for fine-tuning our audio-visual network to solve an
# action-recognition task. We suggest rewriting this code, reusing
# only the parts that are relevant to your application.
import tensorflow as tf
import tensorflow.contrib.slim as slim

import aolib.util as ut
import shift_net as shift
import tfutil as tfu

# Short aliases used throughout this file.
ed = tf.expand_dims
shape = tfu.shape
add_n = tfu.maybe_add_n
pj = ut.pjoin
cast_float = tfu.cast_float
cast_int = tfu.cast_int
def make_net(ims, samples, pr, reuse = True, train = True):
  """Builds the classifier specified by pr.net_type on top of video frames
  `ims` and audio `samples`."""
  if pr.net_type == 'i3d':
    import i3d_kinetics
    keep_prob = 0.5 if train else 1.
    if pr.use_i3d_logits:
      # Use I3D's own logits layer.
      with tf.variable_scope('RGB', reuse = reuse):
        net = tfu.normalize_ims(ims)
        i3d_net = i3d_kinetics.InceptionI3d(
          pr.num_classes, spatial_squeeze = True, final_endpoint = 'Logits')
        logits, _ = i3d_net(net, is_training = train, dropout_keep_prob = keep_prob)
        return ut.Struct(logits = logits, prob = tf.nn.softmax(logits), last_conv = logits)
    else:
      # Pool I3D's last convolutional features and add our own linear classifier.
      with tf.variable_scope('RGB', reuse = reuse):
        i3d_net = i3d_kinetics.InceptionI3d(pr.num_classes, final_endpoint = 'Mixed_5c')
        net = tfu.normalize_ims(ims)
        net, _ = i3d_net(net, is_training = train, dropout_keep_prob = keep_prob)
      last_conv = net
      net = tf.reduce_mean(last_conv, [1, 2, 3], keep_dims = True)
      with slim.arg_scope(shift.arg_scope(pr, reuse = reuse, train = train)):
        logits = shift.conv3d(
          net, pr.num_classes, [1, 1, 1], scope = 'lb/logits',
          activation_fn = None, normalizer_fn = None)[:, 0, 0, 0, :]
      return ut.Struct(logits = logits,
                       prob = tf.nn.softmax(logits),
                       last_conv = net)
  elif pr.net_type == 'shift':
    with slim.arg_scope(shift.arg_scope(pr, reuse = reuse, train = train)):
      # To train the network without audio, you can set samples to be an
      # all-zero array, and set pr.use_sound = False.
      shift_net = shift.make_net(ims, samples, pr, reuse = reuse, train = train)
      if pr.use_dropout:
        shift_net.last_conv = slim.dropout(shift_net.last_conv, is_training = train)
      net = shift_net.last_conv
      # Global average pool over time and space, then a linear classifier.
      net = tf.reduce_mean(net, [1, 2, 3], keep_dims = True)
      logits = shift.conv3d(
        net, pr.num_classes, [1, 1, 1], scope = 'lb/logits',
        activation_fn = None, normalizer_fn = None)[:, 0, 0, 0, :]
      return ut.Struct(logits = logits, prob = tf.nn.softmax(logits), last_conv = net)
  elif pr.net_type == 'c3d':
    import c3d
    with slim.arg_scope(shift.arg_scope(pr, reuse = reuse, train = train)):
      net = c3d.make_net(ims, samples, pr, reuse = reuse, train = train)
      net = net.last_conv
      net = tf.reduce_mean(net, [1, 2, 3], keep_dims = True)
      logits = c3d.conv3d(
        net, pr.num_classes, [1, 1, 1], scope = 'lb/logits',
        activation_fn = None, normalizer_fn = None)[:, 0, 0, 0, :]
      return ut.Struct(logits = logits, prob = tf.nn.softmax(logits), last_conv = net)
  else:
    raise RuntimeError('Unknown net type: %s' % pr.net_type)
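# A minimal sketch (not used by the training pipeline) of calling make_net on
# placeholder inputs. It assumes a params object `pr` along the lines of the
# commented-out shift_base example at the bottom of this file; the shapes are
# the ones the test-time code below uses, not a requirement of make_net.
def _example_make_net(pr):
  ims = tf.placeholder(
    tf.uint8, [1, pr.sampled_frames, pr.crop_im_dim, pr.crop_im_dim, 3])
  samples = tf.placeholder(tf.float32, [1, num_samples(pr), 2])
  # net.logits and net.prob have shape [batch, pr.num_classes]; net.last_conv
  # holds the pooled features that feed the linear classifier.
  return make_net(ims, samples, pr, reuse = False, train = False)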
def read_data(pr, gpus):
  """Reads a training batch on the CPU and splits it evenly across GPUs.
  We suggest rewriting this I/O code for your own application."""
  if pr.variable_frame_count:
    import ucf_dset as shift_dset
  else:
    import cls_dset as shift_dset
  with tf.device('/cpu:0'):
    batch = ut.make_mod(pr.batch_size, len(gpus))
    ims, samples, labels = tfu.on_cpu(
      lambda : shift_dset.make_db_reader(
        pr.train_list, pr, batch, ['im', 'samples', 'label'],
        num_db_files = pr.num_dbs))
    inputs = {'ims' : ims, 'samples' : samples, 'label' : labels}
    splits = [{} for x in xrange(len(gpus))]
    for k, v in inputs.items():
      if v is None:
        for i in xrange(len(gpus)):
          splits[i][k] = None
      else:
        s = tf.split(v, len(gpus))
        for i in xrange(len(gpus)):
          splits[i][k] = s[i]
    return splits
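# For example (illustrative shapes only): with pr.batch_size = 16 and two
# GPUs, read_data returns a list of two dicts, each carrying the tensors for
# 8 examples, e.g. splits[0] = {'ims' : ..., 'samples' : ..., 'label' : ...}.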
def num_samples(pr):
  return int(round(pr.samples_per_frame * pr.sampled_frames))
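# For example, with the UCF-101 settings below (samp_sr = 21000., fps = 29.97),
# samples_per_frame = 21000/29.97 ~= 700.7 audio samples per frame, so a clip
# of, say, 64 sampled frames pairs with round(700.7 * 64) = 44845 samples.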
def label_loss(logits, labels, smooth = False):
  if smooth:
    # Label smoothing: mix the one-hot targets with a uniform distribution
    # over the nc classes, using mixing weight p.
    nc = shape(logits, 1)
    oh = tf.one_hot(labels, nc)
    p = 0.05
    oh = p*(1./nc) + (1 - p)*oh
    loss = tf.nn.softmax_cross_entropy_with_logits(
      logits = logits, labels = oh)
  else:
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits = logits, labels = labels)
  acc = tf.reduce_mean(tfu.cast_float(tf.equal(tf.argmax(logits, 1), labels)))
  acc = tf.stop_gradient(acc)
  acc.ignore = True
  loss = tf.reduce_mean(loss)
  return loss, acc
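# A quick sanity check of the smoothing above (illustrative numbers): with
# nc = 101 classes and p = 0.05, the true class gets target
# 0.05/101 + 0.95 ~= 0.9505 and every other class gets 0.05/101 ~= 0.0005.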
class Model:
  def __init__(self, pr, sess, gpus, is_training = True, profile = False):
    self.pr = pr
    self.sess = sess
    self.gpus = gpus
    self.default_gpu = gpus[0]
    self.is_training = is_training
    self.profile = profile

  def make_train_model(self):
    with tf.device(self.default_gpu):
      pr = self.pr
      # Global step and step-decay learning rate:
      # lr = base_lr * gamma^floor(step / step_size).
      self.step = tf.get_variable(
        'global_step', [], trainable = False,
        initializer = tf.constant_initializer(0), dtype = tf.int64)
      self.lr = tf.constant(pr.base_lr)
      scale = pr.gamma ** tf.floor(cast_float(self.step) / float(pr.step_size))
      self.lr_step = pr.base_lr * scale
      opt = shift.make_opt(pr.opt_method, self.lr_step, pr)
      # Build a replica of the model on each GPU and average the gradients.
      self.inputs = read_data(pr, self.gpus)
      gpu_grads, gpu_losses = {}, {}
      for i, gpu in enumerate(self.gpus):
        with tf.device(gpu):
          reuse = (i > 0)
          ims = self.inputs[i]['ims']
          samples = self.inputs[i]['samples']
          labels = self.inputs[i]['label']
          net = make_net(ims, samples, pr, reuse = reuse, train = self.is_training)
          self.loss = tfu.Loss('loss')
          self.loss.add_loss(shift.slim_losses_with_prefix(None), 'reg')
          self.loss.add_loss_acc(label_loss(net.logits, labels), 'label')
          grads = opt.compute_gradients(self.loss.total_loss())
          ut.add_dict_list(gpu_grads, self.loss.name, grads)
          ut.add_dict_list(gpu_losses, self.loss.name, self.loss)
          if i == 0:
            self.net = net
      (gs, vs) = zip(*tfu.average_grads(gpu_grads['loss']))
      if pr.grad_clip is not None:
        gs, _ = tf.clip_by_global_norm(gs, pr.grad_clip)
      # Print the gradient norm every 100 steps.
      gs = [tfu.print_every(gs[0], 100, ['grad norm:', tf.global_norm(gs)])] + list(gs[1:])
      gvs = zip(gs, vs)
      # Make sure batch-norm statistics are updated on each training step.
      bn_ups = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(bn_ups):
        self.train_op = opt.apply_gradients(gvs, global_step = self.step)
      self.coord = tf.train.Coordinator()
      self.saver_fast = tf.train.Saver()
      self.saver_slow = tf.train.Saver(max_to_keep = 1000)
      if self.is_training:
        self.init_op = tf.group(
          tf.global_variables_initializer(),
          tf.local_variables_initializer())
        self.sess.run(self.init_op)
      tf.train.start_queue_runners(sess = self.sess, coord = self.coord)
      self.merged_summary = tf.summary.merge_all()
      print 'Tensorboard command:'
      summary_dir = ut.mkdir(pj(pr.summary_dir, ut.simple_timestamp()))
      print 'tensorboard --logdir=%s' % summary_dir
      self.sum_writer = tf.summary.FileWriter(summary_dir, self.sess.graph)
      if self.profile:
        self.profiler = tf.profiler.Profiler(self.sess.graph)
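  # For example (illustrative values only): with base_lr = 1e-2, gamma = 0.1,
  # and step_size = 10000, the schedule above yields lr = 1e-2 for steps
  # 0-9999, 1e-3 for steps 10000-19999, 1e-4 after that, and so on.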
  def make_test_model(self):
    with tf.device(self.default_gpu):
      pr = self.pr
      if 0 and pr.variable_frame_count:
        # Disabled: placeholders with a variable number of frames/samples.
        self.test_ims_ph = tf.placeholder(tf.uint8, [1, None, pr.crop_im_dim, pr.crop_im_dim, 3])
        self.test_samples_ph = tf.placeholder(tf.float32, [1, None, 2])
      else:
        if hasattr(pr, 'resampled_frames'):
          self.test_ims_ph = tf.placeholder(
            tf.uint8, [1, pr.resampled_frames, pr.crop_im_dim, pr.crop_im_dim, 3])
        else:
          self.test_ims_ph = tf.placeholder(
            tf.uint8, [1, pr.sampled_frames, pr.crop_im_dim, pr.crop_im_dim, 3])
        self.test_samples_ph = tf.placeholder(tf.float32, [1, num_samples(pr), 2])
      assert not self.is_training
      self.test_net = make_net(
        self.test_ims_ph, self.test_samples_ph, pr,
        reuse = False, train = self.is_training)
  def checkpoint_fast(self):
    check_path = pj(ut.mkdir(self.pr.train_dir), 'net.tf')
    out = self.saver_fast.save(self.sess, check_path, global_step = self.step)
    print 'Checkpoint:', out

  def checkpoint_slow(self):
    check_path = pj(ut.mkdir(pj(self.pr.train_dir, 'slow')), 'net.tf')
    out = self.saver_slow.save(self.sess, check_path, global_step = self.step)
    print 'Checkpoint:', out

  def restore(self, path = None, restore_opt = True, ul_only = False):
    if path is None:
      path = tf.train.latest_checkpoint(self.pr.train_dir)
    print 'Restoring:', path
    var_list = slim.get_variables_to_restore()
    if not restore_opt:
      # Skip optimizer state, the classifier, and batch-norm scale (gamma)
      # parameters when restoring.
      opt_names = ['Adam', 'beta1_power', 'beta2_power', 'Momentum'] + ['cls']
      print 'removing bn gamma'
      opt_names += ['gamma']
      var_list = [x for x in var_list if not any(name in x.name for name in opt_names)]
    if ul_only:
      # Restore only the (unsupervised) base network: drop the label
      # branch ('lb/') and the global step.
      var_list = [x for x in var_list
                  if not x.name.startswith('lb/') and ('global_step' not in x.name)]
    print 'Restoring variables:'
    for x in var_list:
      print x.name
    tf.train.Saver(var_list).restore(self.sess, path)
  def get_step(self):
    return self.sess.run([self.step, self.lr_step])

  def train(self):
    val_hist = {}
    pr = self.pr
    i = 0
    while True:
      step, lr = self.get_step()
      if i > 0 and step % pr.check_iters == 0:
        self.checkpoint_fast()
      if i > 0 and step % pr.slow_check_iters == 0:
        self.checkpoint_slow()
      if step >= pr.train_iters:
        break
      start = ut.now_sec()
      if step % 20 == 0:
        # Periodically also write TensorBoard summaries.
        ret = self.sess.run([self.train_op, self.merged_summary] + self.loss.get_losses())
        self.sum_writer.add_summary(ret[1], step)
        loss_vals = ret[2:]
      else:
        loss_vals = self.sess.run([self.train_op] + self.loss.get_losses())[1:]
      ts = moving_avg('time', ut.now_sec() - start, val_hist)
      out = []
      for name, val in zip(self.loss.get_loss_names(), loss_vals):
        out.append('%s: %.3f' % (name, moving_avg(name, val, val_hist)))
      out = ' '.join(out)
      if step < 10 or step % pr.print_iters == 0:
        print 'Iteration %d, lr = %.0e, %s, time: %.3f' % (step, lr, out, ts)
      i += 1
def moving_avg(name, x, vals, avg_win_size = 100, p = 0.99):
  # Exponential moving average with rate p; with p = 0.99 the effective
  # window is roughly 1/(1 - p) = 100 iterations. (avg_win_size is unused.)
  vals[name] = p*vals.get(name, x) + (1 - p)*x
  return vals[name]
def train(pr, gpus, restore = False, restore_opt = True,
          num_gpus = None, profile = False):
  print pr
  gpus = tfu.set_gpus(gpus)
  with tf.Graph().as_default():
    config = tf.ConfigProto(allow_soft_placement = True)
    sess = tf.InteractiveSession(config = config)
    gpus = gpus[:num_gpus]
    model = Model(pr, sess, gpus, profile = profile)
    model.make_train_model()
    if restore:
      model.restore(restore_opt = restore_opt)
    elif pr.init_path is not None:
      init_ops = []
      if pr.net_type == 'i3d':
        # Restore the I3D ('RGB') weights from a pretrained checkpoint,
        # skipping optimizer state.
        opt_names = ['Adam', 'beta1_power', 'beta2_power', 'Momentum']
        rgb_variable_map = {}
        for variable in tf.global_variables():
          if any(x in variable.name for x in opt_names):
            print 'Skipping:', variable.name
            continue
          if pr.init_from_2d:
            # Initialize the 3D network from a 2D (InceptionV1) checkpoint by
            # "inflating" each 2D kernel: tile it along time and rescale.
            if variable.name.split('/')[0] == 'RGB':
              cp_name = (
                variable.name
                .replace('RGB/inception_i3d', 'InceptionV1')
                .replace('Conv3d', 'Conv2d')
                .replace('batch_norm', 'BatchNorm')
                .replace('conv_3d/w', 'weights')
                .replace(':0', ''))
              print 'shape of', variable.name, shape(variable)
              v = tf.get_variable(cp_name, shape(variable)[1:], tf.float32)
              rgb_variable_map[cp_name] = v
              n = shape(v, 0)
              init_ops.append(variable.assign(
                1.0/float(n) * tf.tile(ed(v, 0), (n, 1, 1, 1, 1))))
          else:
            if variable.name.split('/')[0] == 'RGB':
              rgb_variable_map[variable.name.replace(':0', '')] = variable
        rgb_saver = tf.train.Saver(var_list = rgb_variable_map, reshape = True)
        rgb_saver.restore(sess, pr.init_path)
        for x in init_ops:
          print 'Running:', x
          sess.run(x)
      else:
        print 'Restoring from init_path:', pr.init_path
        model.restore(pr.init_path, ul_only = True, restore_opt = False)
    tf.get_default_graph().finalize()
    model.train()
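# A minimal numpy sketch (hypothetical helper, not part of the pipeline) of
# the 2D-to-3D "inflation" used above when pr.init_from_2d is set: a 2D kernel
# of shape [kh, kw, cin, cout] is tiled n times along a new time axis and
# scaled by 1/n, so that at initialization the inflated 3D conv matches the
# 2D conv applied to a temporally-constant input.
def _example_inflate_2d_kernel(w_2d, n):
  import numpy as np
  # [kh, kw, cin, cout] -> [n, kh, kw, cin, cout]
  return np.tile(w_2d[None], (n, 1, 1, 1, 1)) / float(n)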
# Example parameters for UCF-101:
# def shift_base(name, num_gpus):
#   total_dur = 10.
#   fps = 29.97
#   frame_dur = 1./fps
#   samp_sr = 21000.
#   pr = Params(train_iters = TrainIters,
#               gamma = 0.1,
#               step_size = StepSize,
#               subsample_frames = None,
#               cam = False,
#               base_lr = BaseLR,
#               opt_method = OptMethod,
#               multipass = False,
#               momentum_rate = 0.9,
#               grad_clip = None,
#               batch_size = int(8*num_gpus),
#               val_batch = 1,
#               resdir = pj('../results/ucf-eval', name),
#               weight_decay = 1e-5,
#               train_list = pj(DataPath, 'ucf-tf-train-v5/tf'),
#               val_list = pj(DataPath, 'ucf-tf-train-v5/tf'),
#               test_list = '/data/efros/owens/ucf-test-files-1',
#               init_path = '../results/nets/shift/net.tf-650000',
#               use_sound = True,
#               im_type = 'jpeg',
#               input_type = 'samples',
#               full_im_dim = 256,
#               crop_im_dim = 224,
#               renorm = True,
#               checkpoint_iters = 1000,
#               dset_seed = None,
#               samp_sr = samp_sr,
#               fps = fps,
#               total_frames = int(total_dur*fps),
#               sampled_frames = int(VidDur*fps),
#               full_samples_len = int(total_dur * samp_sr),
#               samples_per_frame = samp_sr * frame_dur,
#               frame_sample_delta = int(total_dur*fps)/2,
#               max_intersection = -1,
#               batch_norm = True,
#               show_videos = False,
#               slow_check_iters = 1000,
#               check_iters = 500,
#               decompress_flow = True,
#               print_iters = 10,
#               fix_frame = False,
#               do_shift = False,
#               use_3d = True,
#               augment_ims = True,
#               augment_audio = True,
#               multi_shift = False,
#               num_dbs = None,
#               num_classes = 101,
#               add_top_block = False,
#               variable_frame_count = True,
#               net_type = 'shift',
#               test_size = 3783,
#               pool_frac = None,
#               bn_last = False,
#               im_split = True,
#               num_splits = 4,
#               use_dropout = False,
#               bn_scale = True)
#   pr.num_samples = int(pr.samples_per_frame * float(pr.sampled_frames))
#   return pr
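# A minimal usage sketch (hedged: shift_base above is commented out, and the
# values here are placeholders, not recommendations):
#   pr = shift_base('ucf-finetune', num_gpus = 1)
#   train(pr, gpus = [0], restore = False)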