Address of the forked project: SSD
The author wrote the training code in a distributed-training style, which makes that part of the code unusually bloated; I have added some comments to it. I am not very familiar with multi-machine distributed training, and it is not the focus here, so I won't go into much detail. Instead, I will briefly go over the author's training-time optimization setup, including the choice of optimizer and related settings.
```python
# =================================================================== #
# Configure the moving averages.
# =================================================================== #
if FLAGS.moving_average_decay:
    moving_average_variables = slim.get_model_variables()
    variable_averages = tf.train.ExponentialMovingAverage(
        FLAGS.moving_average_decay, global_step)
else:
    moving_average_variables, variable_averages = None, None
```
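For context: `tf.train.ExponentialMovingAverage` keeps shadow copies of the listed variables, and the update op returned by its `apply` method has to run alongside the weight updates. Below is a minimal sketch of the usual wiring, not this repository's exact code (which does this inside the distributed section skipped later); `grad_update_op` is a hypothetical gradient-apply op.

```python
# Minimal sketch of attaching the moving average to training (not this
# repository's exact wiring). `grad_update_op` is a hypothetical op that
# applies the gradients to the model variables.
if variable_averages is not None:
    with tf.control_dependencies([grad_update_op]):
        # `apply` creates shadow variables and returns the op that updates them;
        # the control dependency makes it run after the weights are updated.
        train_op = variable_averages.apply(moving_average_variables)
```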
```python
with tf.device(deploy_config.optimizer_device()):
    learning_rate = tf_utils.configure_learning_rate(FLAGS,
                                                     dataset.num_samples,
                                                     global_step)
```
The implementation supports three schedules: a constant learning rate and two different decay policies (default: exponential):
```python
def configure_learning_rate(flags, num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.
    Returns:
      A `Tensor` representing the learning rate.
    """
    decay_steps = int(num_samples_per_epoch / flags.batch_size *
                      flags.num_epochs_per_decay)

    if flags.learning_rate_decay_type == 'exponential':
        return tf.train.exponential_decay(flags.learning_rate,
                                          global_step,
                                          decay_steps,
                                          flags.learning_rate_decay_factor,
                                          staircase=True,
                                          name='exponential_decay_learning_rate')
    elif flags.learning_rate_decay_type == 'fixed':
        return tf.constant(flags.learning_rate, name='fixed_learning_rate')
    elif flags.learning_rate_decay_type == 'polynomial':
        return tf.train.polynomial_decay(flags.learning_rate,
                                         global_step,
                                         decay_steps,
                                         flags.end_learning_rate,
                                         power=1.0,
                                         cycle=False,
                                         name='polynomial_decay_learning_rate')
```
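To make the default `exponential` branch concrete: with `staircase=True`, the learning rate is multiplied by `learning_rate_decay_factor` once every `decay_steps` steps, where `decay_steps` corresponds to `num_epochs_per_decay` epochs. A quick sketch with made-up flag values (not the repository's defaults):

```python
# Quick sketch of the staircase exponential schedule; all numbers here are
# made up for illustration, not the repository's default flag values.
num_samples_per_epoch = 10000
batch_size = 32
num_epochs_per_decay = 2.0
learning_rate = 0.001
learning_rate_decay_factor = 0.94

decay_steps = int(num_samples_per_epoch / batch_size * num_epochs_per_decay)  # 625

def lr_at(step):
    # staircase=True: the exponent is an integer, so the rate drops in steps
    return learning_rate * learning_rate_decay_factor ** (step // decay_steps)

print(lr_at(0), lr_at(624))     # 0.001 0.001      (first "stair")
print(lr_at(625), lr_at(1250))  # 0.00094 0.0008836 (after one / two decays)
```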
Back in the training script, the optimizer is then constructed:

```python
optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
```
The choice of optimizers is rich (default: adam):
```python
def configure_optimizer(flags, learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.
    Returns:
      An instance of an optimizer.
    """
    if flags.optimizer == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate,
            rho=flags.adadelta_rho,
            epsilon=flags.opt_epsilon)
    elif flags.optimizer == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(
            learning_rate,
            initial_accumulator_value=flags.adagrad_initial_accumulator_value)
    elif flags.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate,
            beta1=flags.adam_beta1,
            beta2=flags.adam_beta2,
            epsilon=flags.opt_epsilon)
    elif flags.optimizer == 'ftrl':
        optimizer = tf.train.FtrlOptimizer(
            learning_rate,
            learning_rate_power=flags.ftrl_learning_rate_power,
            initial_accumulator_value=flags.ftrl_initial_accumulator_value,
            l1_regularization_strength=flags.ftrl_l1,
            l2_regularization_strength=flags.ftrl_l2)
    elif flags.optimizer == 'momentum':
        optimizer = tf.train.MomentumOptimizer(
            learning_rate,
            momentum=flags.momentum,
            name='Momentum')
    elif flags.optimizer == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate,
            decay=flags.rmsprop_decay,
            momentum=flags.rmsprop_momentum,
            epsilon=flags.opt_epsilon)
    elif flags.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
        raise ValueError('Optimizer [%s] was not recognized', flags.optimizer)
    return optimizer
```
In between there is actually a fairly long distributed gradient-computation section that I won't cover in detail. Roughly: gradients are computed on each clone, the per-clone gradients are aggregated, the aggregated update is applied to the (shared) clone networks, and the resulting update op is exposed as train_tensor, and so on.
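To make that summary concrete, the skipped section boils down to a pattern like the one below. This is a minimal sketch of the idea only, not the author's model_deploy code; `clone_losses` is a hypothetical list of per-GPU loss tensors built under variable-sharing scopes, while `optimizer` and `global_step` are the objects defined above.

```python
# Minimal sketch of per-clone gradient computation and aggregation
# (not the author's model_deploy code). `clone_losses` is a hypothetical
# list of per-GPU loss tensors sharing the same model variables.
clone_grads = []
for i, loss in enumerate(clone_losses):
    with tf.device('/gpu:%d' % i):
        # Each clone differentiates its own loss w.r.t. the shared variables.
        clone_grads.append(optimizer.compute_gradients(loss))

# Aggregate the gradients variable-by-variable across clones.
summed_grads = []
for grads_and_vars in zip(*clone_grads):
    grads = [g for g, _ in grads_and_vars if g is not None]
    var = grads_and_vars[0][1]
    summed_grads.append((tf.add_n(grads), var))

# One apply op updates the shared variables; exposing it behind the total loss
# yields the train_tensor handed to slim.learning.train below.
grad_updates = optimizer.apply_gradients(summed_grads, global_step=global_step)
total_loss = tf.add_n(clone_losses, name='total_loss')
with tf.control_dependencies([grad_updates]):
    train_tensor = tf.identity(total_loss, name='train_op')
```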
Finally, the session configuration, the saver, and the slim training loop:

```python
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
config = tf.ConfigProto(log_device_placement=False,
                        gpu_options=gpu_options)
saver = tf.train.Saver(max_to_keep=5,
                       keep_checkpoint_every_n_hours=1.0,
                       write_version=2,
                       pad_step_number=False)

slim.learning.train(
    train_tensor,
    logdir=FLAGS.train_dir,
    master='',
    is_chief=True,
    init_fn=tf_utils.get_init_fn(FLAGS),            # see the function below: restores (assigns) variables from a checkpoint
    summary_op=summary_op,                          # tf.summary.merge node
    number_of_steps=FLAGS.max_number_of_steps,      # max number of training steps
    log_every_n_steps=FLAGS.log_every_n_steps,      # how often to log training info
    save_summaries_secs=FLAGS.save_summaries_secs,  # seconds between summary saves
    saver=saver,                                    # tf.train.Saver node
    save_interval_secs=FLAGS.save_interval_secs,    # seconds between checkpoint saves
    session_config=config,                          # session configuration
    sync_optimizer=None)
```
The initialization function it calls is as follows:
```python
def get_init_fn(flags):
    """Returns a function run by the chief worker to warm-start the training.

    Note that the init_fn is only run when initializing the model during the very
    first global step.

    Returns:
      An init function run by the supervisor.
    """
    if flags.checkpoint_path is None:
        return None
    # Warn the user if a checkpoint exists in the train_dir. Then ignore.
    if tf.train.latest_checkpoint(flags.train_dir):
        tf.logging.info(
            'Ignoring --checkpoint_path because a checkpoint already exists in %s'
            % flags.train_dir)
        return None

    exclusions = []
    if flags.checkpoint_exclude_scopes:
        exclusions = [scope.strip()
                      for scope in flags.checkpoint_exclude_scopes.split(',')]

    # TODO(sguada) variables.filter_variables()
    variables_to_restore = []
    for var in slim.get_model_variables():
        excluded = False
        for exclusion in exclusions:
            if var.op.name.startswith(exclusion):
                excluded = True
                break
        if not excluded:
            variables_to_restore.append(var)
    # Change model scope if necessary.
    if flags.checkpoint_model_scope is not None:
        variables_to_restore = \
            {var.op.name.replace(flags.model_name,
                                 flags.checkpoint_model_scope): var
             for var in variables_to_restore}

    if tf.gfile.IsDirectory(flags.checkpoint_path):
        checkpoint_path = tf.train.latest_checkpoint(flags.checkpoint_path)
    else:
        checkpoint_path = flags.checkpoint_path
    tf.logging.info('Fine-tuning from %s. Ignoring missing vars: %s'
                    % (checkpoint_path, flags.ignore_missing_vars))

    return slim.assign_from_checkpoint_fn(
        checkpoint_path,
        variables_to_restore,
        ignore_missing_vars=flags.ignore_missing_vars)
```
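Note that the return value is itself a callable that takes a session; slim.learning.train invokes it once at the very first global step. A minimal sketch of consuming it outside slim (assuming the SSD graph has already been built and `FLAGS` holds the parsed flags):

```python
# Minimal sketch of how the init_fn is consumed; slim.learning.train does this
# internally on the first global step. `FLAGS` is assumed to be the parsed flags.
init_fn = tf_utils.get_init_fn(FLAGS)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    if init_fn is not None:
        # Restores the non-excluded model variables from the checkpoint.
        init_fn(sess)
```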
That concludes the walkthrough of the SSD project. The training command is given below. Note that by default the number of training steps is unlimited (no --max_number_of_steps is set), so training keeps running until you stop it manually; keep an eye on the training metrics and stop once they are good enough.
```bash
DATASET_DIR=./tfrecords
TRAIN_DIR=./logs/
CHECKPOINT_PATH=./checkpoints/ssd_300_vgg.ckpt
python train_ssd_network.py \
    --train_dir=${TRAIN_DIR} \
    --dataset_dir=${DATASET_DIR} \
    --dataset_name=pascalvoc_2012 \
    --dataset_split_name=train \
    --model_name=ssd_300_vgg \
    --checkpoint_path=${CHECKPOINT_PATH} \
    --save_summaries_secs=60 \
    --save_interval_secs=600 \
    --weight_decay=0.0005 \
    --optimizer=adam \
    --learning_rate=0.001 \
    --batch_size=32
```
For how to use the trained model, see the last part of the 集智專欄 article.