if FLAGS.eval:
    evaluator = FeedForwardEvaluator(preprocessor, eval_func)
    print("Building evaluation graph")
    top1_op, top5_op, enqueue_ops = evaluator.evaluation_step(batch_size)
else:
    nstep_per_epoch = nrecord // (batch_size * hvd.size())
    trainer = FeedForwardTrainer(preprocessor, loss_func, nstep_per_epoch)
    print_r0("Building training graph")
    total_loss, learning_rate, train_ops = trainer.training_step(
        batch_size)

print_r0("Creating session")
config = tf.ConfigProto()
config.intra_op_parallelism_threads = 1
config.inter_op_parallelism_threads = 10
config.gpu_options.force_gpu_compatible = True
# Horovod: pin this process to the GPU matching its local rank.
config.gpu_options.visible_device_list = str(hvd.local_rank())
sess = tf.Session(config=config)

train_writer = None
saver = None
summary_ops = None
if hvd.rank() == 0 and len(FLAGS.log_dir):
    log_dir = FLAGS.log_dir
    train_writer = tf.summary.FileWriter(log_dir, sess.graph)
    summary_ops = tf.summary.merge_all()
    last_summary_time = time.time()
    saver = tf.train.Saver(keep_checkpoint_every_n_hours=3)
    last_save_time = time.time()

if not FLAGS.eval:
    print_r0("Initializing variables")
def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    hvd.init()
    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')
    session_config = tf.ConfigProto()
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
    # Horovod: pin this process to the GPU matching its local rank.
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    if FLAGS.allow_xla:
        session_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)
    # Horovod: only worker 0 writes checkpoints, so other workers cannot corrupt them.
    model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
    config = tf.estimator.RunConfig(model_dir=model_dir, session_config=session_config)
    train_and_eval_dict = model_lib.create_estimator_and_inputs(
        run_config=config,
        eval_count=FLAGS.eval_count,
        hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
        pipeline_config_path=FLAGS.pipeline_config_path,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples))
    estimator = train_and_eval_dict['estimator']
    train_input_fn = train_and_eval_dict['train_input_fn']
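# Note: the original snippet stops after unpacking the estimator and the training
# input function. As a hedged sketch of the usual Horovod convention (not shown in
# this source), the optimizer built inside the model function is scaled by the
# worker count and wrapped so gradients are averaged across ranks; base_lr is an
# illustrative name, not taken from the snippet.
opt = tf.train.MomentumOptimizer(learning_rate=base_lr * hvd.size(), momentum=0.9)
opt = hvd.DistributedOptimizer(opt)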
def setup_horovod_execution(self):
    """
    Sets up Horovod.
    """
    # Check the backend again so horovod is only imported when it is actually
    # requested; an unconditional import would crash if horovod is not installed.
    if get_distributed_backend() == "horovod":
        import horovod.tensorflow as hvd
        self.logger.info("Setting up Horovod execution.")
        hvd.init()
        config = tf.ConfigProto()
        # Horovod: pin this process to the GPU matching its local rank.
        config.gpu_options.visible_device_list = str(hvd.local_rank())
if config.profile_config.profile_worker is not None and worker_id != config.profile_config.profile_worker:
    # Only one CUPTI profiler can run on a machine, so disable profiling on every
    # worker except the designated one.
    # See tensorflow/tensorflow/core/platform/default/device_tracer.cc:L452
    config.profile_config.profile_dir = None
else:
    config.profile_config.profile_dir = \
        os.path.join(config.profile_config.profile_dir, hostname,
                     'worker:%d' % worker_id, 'run_meta')

ckpt_hooks = build_ckpt_hooks(config.get_ckpt_config()) if worker_id == 0 else None

sess_config = config.sess_config
if sess_config is None:
    sess_config = tf.ConfigProto(allow_soft_placement=True)
# Horovod: pin this process to the GPU matching its local rank.
sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

sess = tf.train.MonitoredTrainingSession(
    is_chief=True,
    checkpoint_dir=config.get_ckpt_config().ckpt_dir if worker_id == 0 else None,
    # TODO: Allow user-defined hooks
    hooks=None,
    chief_only_hooks=ckpt_hooks,
    save_checkpoint_secs=None,
    save_summaries_steps=None,
    save_summaries_secs=None,
    config=sess_config)
parallax_log.debug(
    "Created MonitoredTrainingSession for worker %d" % worker_id)
_init_global_vars(sess)
parallax_log.debug(
    "Finished initialization process, start training on worker %d" % worker_id)
.prefetch(BUFFER))
data_length = len(train_df)
validation_length = len(validation_df)

def train_input_fn():
    return train_data.make_one_shot_iterator().get_next()

def validation_input_fn():
    return validation_data.make_one_shot_iterator().get_next()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
rfig = tf.estimator.RunConfig(save_checkpoints_steps=10000, session_config=config)
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
model_dir = os.getenv('AZ_BATCHAI_OUTPUT_MODEL') if hvd.rank() == 0 else None
params = {"learning_rate": LR}
# rfig = tf.estimator.RunConfig(save_checkpoints_steps=1000)
logger.info('Creating estimator with params: {}'.format(params))
model = tf.estimator.Estimator(model_fn=model_fn,
                               params=params,
                               model_dir=model_dir,
                               config=rfig)
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
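# Note: the broadcast hook above only takes effect once it is passed to the training
# call. Below is a hedged sketch of that call (not part of the original snippet);
# BATCH_SIZE and EPOCHS are illustrative names, and the step count is divided by
# hvd.size() so the work is split across workers.
model.train(input_fn=train_input_fn,
            steps=(data_length * EPOCHS // BATCH_SIZE) // hvd.size(),
            hooks=[bcast_hook])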
if params.gpu_memory_frac_for_testing > 0:
    config.gpu_options.per_process_gpu_memory_fraction = (
        params.gpu_memory_frac_for_testing)
if params.xla:
    config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
if params.enable_layout_optimizer:
    config.graph_options.rewrite_options.layout_optimizer = (
        rewriter_config_pb2.RewriterConfig.ON)
if params.rewriter_config:
    rewriter_config = rewriter_config_pb2.RewriterConfig()
    text_format.Merge(params.rewriter_config, rewriter_config)
    config.graph_options.rewrite_options.CopyFrom(rewriter_config)
if params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    config.gpu_options.visible_device_list = str(hvd.local_rank())
return config
def _config(method):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if method == 'HOROVOD':
        import horovod.tensorflow as hvd
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    else:
        config.gpu_options.visible_device_list = str(_get_cuda_index())
    return config
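# Note: a hedged usage sketch for the helper above (not part of the original
# snippet): open a session pinned to the GPU chosen by _config().
sess = tf.Session(config=_config('HOROVOD'))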
if hvd.rank() == 0:
    experiment.log_data_ref(data=train_data, data_name='x_train')
    experiment.log_data_ref(data=train_labels, data_name='y_train')
    experiment.log_data_ref(data=eval_data, data_name='x_test')
    experiment.log_data_ref(data=eval_labels, data_name='y_test')
# The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
# into (-1, 784) to feed into our network. Also, need to normalize the
# features between 0 and 1.
train_data = np.reshape(train_data, (-1, 784)) / 255.0
eval_data = np.reshape(eval_data, (-1, 784)) / 255.0
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None
# Create the Estimator
mnist_classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn, model_dir=model_dir,
    config=tf.estimator.RunConfig(session_config=config))
# Set up logging for predictions
# Log the values in the "Softmax" tensor with label "probabilities"
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log, every_n_iter=500)
# Download and load MNIST dataset.
(train_data, train_labels), (eval_data, eval_labels) = keras.datasets.mnist.load_data(
    "/tmp/MNIST-data-%d" % hvd.rank()
)
# The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
# into (-1, 784) to feed into our network. Also, need to normalize the
# features between 0 and 1.
train_data = np.reshape(train_data, (-1, 784)) / 255.0
eval_data = np.reshape(eval_data, (-1, 784)) / 255.0
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
model_dir = "./mnist_convnet_model" if hvd.rank() == 0 else None
# Create the Estimator
mnist_classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn,
    model_dir=model_dir,
    config=tf.estimator.RunConfig(session_config=config),
)
# Set up logging for predictions
# Log the values in the "Softmax" tensor with label "probabilities"
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=500)
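# Note: the snippet ends at the logging hook. Below is a hedged sketch of the
# training call that typically follows in Horovod Estimator examples (not part of
# this source); train_input_fn and the 20000-step budget are illustrative
# assumptions, and the broadcast hook syncs rank 0's initial weights to every worker.
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
mnist_classifier.train(input_fn=train_input_fn,
                       steps=20000 // hvd.size(),
                       hooks=[logging_hook, bcast_hook])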
def _get_runconfig(is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        return tf.estimator.RunConfig(save_checkpoints_steps=None,
                                      save_checkpoints_secs=None,
                                      session_config=config)
    else:
        return tf.estimator.RunConfig(save_checkpoints_steps=None)
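# Note: a hedged sketch (not part of the original snippet) of how the run config
# above would typically be consumed; model_fn and the './outputs' directory are
# illustrative assumptions, with checkpoints written only from rank 0 as in the
# snippets above.
run_config = _get_runconfig()
model_dir = './outputs' if hvd.rank() == 0 else None
estimator = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir=model_dir,
                                   config=run_config)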