How to use the horovod.tensorflow.local_rank function in horovod

To help you get started, we’ve selected a few horovod examples, based on popular ways it is used in public projects. hvd.local_rank() returns the rank of the calling process within its own node, as opposed to hvd.rank(), which is the global rank across all nodes; its most common use, and the one every snippet below shares, is pinning each process to a single GPU.
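
Before the project snippets, here is a minimal, self-contained sketch of the pattern they all share (TF 1.x graph mode; any model code around it is assumed): each process calls hvd.init() and then exposes only the GPU matching its local rank, so device 0 inside every process maps to a different physical GPU on the node.

import tensorflow as tf
import horovod.tensorflow as hvd

# Initialize Horovod; rank()/local_rank() are only valid after this call.
hvd.init()

# Pin this process to one GPU: with visible_device_list set to the local
# rank, each process sees exactly one GPU and addresses it as device 0.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

sess = tf.Session(config=config)

local_rank() rather than rank() is the right index here: with two 4-GPU nodes, global ranks run 0-7, but local ranks run 0-3 on each node, matching that node's GPU indices.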

github HewlettPackard / dlcookbook-dlbs / python / nvcnn_hvd_benchmarks / nvcnn_hvd.py
        evaluator = FeedForwardEvaluator(preprocessor, eval_func)
        print("Building evaluation graph")
        top1_op, top5_op, enqueue_ops = evaluator.evaluation_step(batch_size)
    else:
        nstep_per_epoch = nrecord // (batch_size * hvd.size())
        trainer = FeedForwardTrainer(preprocessor, loss_func, nstep_per_epoch)
        print_r0("Building training graph")
        total_loss, learning_rate, train_ops = trainer.training_step(
            batch_size)

    print_r0("Creating session")
    config = tf.ConfigProto()
    config.intra_op_parallelism_threads = 1
    config.inter_op_parallelism_threads = 10
    config.gpu_options.force_gpu_compatible = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    sess = tf.Session(config=config)

    train_writer = None
    saver = None
    summary_ops = None
    if hvd.rank() == 0 and len(FLAGS.log_dir):
        log_dir = FLAGS.log_dir
        train_writer = tf.summary.FileWriter(log_dir, sess.graph)
        summary_ops = tf.summary.merge_all()
        last_summary_time = time.time()
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=3)
        last_save_time = time.time()

    if not FLAGS.eval:
        print_r0("Initializing variables")

github NVIDIA / DeepLearningExamples / TensorFlow / Detection / SSD / models / research / object_detection / model_main.py
def main(unused_argv):
  tf.logging.set_verbosity(tf.logging.INFO)

  hvd.init()

  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  session_config = tf.ConfigProto()
  session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
  session_config.gpu_options.visible_device_list = str(hvd.local_rank())
  if FLAGS.allow_xla:
      session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
  model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
  config = tf.estimator.RunConfig(model_dir=model_dir, session_config=session_config)

  train_and_eval_dict = model_lib.create_estimator_and_inputs(
      run_config=config,
      eval_count=FLAGS.eval_count,
      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
      pipeline_config_path=FLAGS.pipeline_config_path,
      train_steps=FLAGS.num_train_steps,
      sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
      sample_1_of_n_eval_on_train_examples=(
          FLAGS.sample_1_of_n_eval_on_train_examples))
  estimator = train_and_eval_dict['estimator']
  train_input_fn = train_and_eval_dict['train_input_fn']

github rlgraph / rlgraph / rlgraph / graphs / tensorflow_executor.py
    def setup_horovod_execution(self):
        """
        Sets up Horovod.
        """
        # Check the backend again here so horovod is only imported when it is
        # actually selected; the import would crash if horovod is not installed.
        if get_distributed_backend() == "horovod":
            import horovod.tensorflow as hvd
            self.logger.info("Setting up Horovod execution.")
            hvd.init()
            config = tf.ConfigProto()
            config.gpu_options.visible_device_list = str(hvd.local_rank())

github snuspl / parallax / parallax / parallax / core / python / mpi / runner.py
            if config.profile_config.profile_worker is not None and worker_id != config.profile_config.profile_worker:
                #Only one CUPTI profiler can run in a machine
                #See tensorflow/tensorflow/core/platform/default/device_tracer.cc:L452
                config.profile_config.profile_dir = None
            else:
                config.profile_config.profile_dir = \
                    os.path.join(config.profile_config.profile_dir, hostname,
                                 'worker:%d' % worker_id, 'run_meta')

        ckpt_hooks = build_ckpt_hooks(config.get_ckpt_config()) if worker_id == 0 else None

        sess_config = config.sess_config
        if sess_config is None:
            sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
        sess = tf.train.MonitoredTrainingSession(
                is_chief=True,
                checkpoint_dir=config.get_ckpt_config().ckpt_dir if worker_id == 0 else None,
                # TODO: Allow user-defined hooks
                hooks=None,
                chief_only_hooks=ckpt_hooks,
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None,
                config=sess_config)

        parallax_log.debug(
            "Created MonitoredTrainingSession for worker %d" % worker_id)
        _init_global_vars(sess)
        parallax_log.debug(
            "Finished initialization process, start training on \

github microsoft / DistributedDeepLearning / ImagenetEstimatorHorovod.py
.prefetch(BUFFER))


    data_length = len(train_df)
    validation_length = len(validation_df)

    def train_input_fn():
        return train_data.make_one_shot_iterator().get_next()

    def validation_input_fn():
        return validation_data.make_one_shot_iterator().get_next()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    rfig = tf.estimator.RunConfig(save_checkpoints_steps=10000, session_config=config)

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = os.getenv('AZ_BATCHAI_OUTPUT_MODEL') if hvd.rank() == 0 else None

    params = {"learning_rate": LR}
    # rfig = tf.estimator.RunConfig(save_checkpoints_steps=1000)
    logger.info('Creating estimator with params: {}'.format(params))
    model = tf.estimator.Estimator(model_fn=model_fn,
                                   params=params,
                                   model_dir=model_dir,
                                   config=rfig)

    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
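
The bcast_hook created on the last line above is the usual companion to local_rank pinning: it broadcasts rank 0's initial variable values to every other process so all workers start training from identical weights. A sketch of how such a hook is typically attached, where estimator and train_input_fn stand in for the objects built earlier in that script:

# Broadcast initial variable states from rank 0 to all other processes.
# 'estimator' and 'train_input_fn' are hypothetical stand-ins here.
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
estimator.train(input_fn=train_input_fn,
                steps=10000 // hvd.size(),
                hooks=[bcast_hook])

Dividing the step count by hvd.size() keeps the total amount of work constant as workers are added, since every worker processes its own batches.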

github HewlettPackard / dlcookbook-dlbs / python / tf_cnn_benchmarks / benchmark_cnn.py
  if params.gpu_memory_frac_for_testing > 0:
    config.gpu_options.per_process_gpu_memory_fraction = (
        params.gpu_memory_frac_for_testing)
  if params.xla:
    config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  if params.enable_layout_optimizer:
    config.graph_options.rewrite_options.layout_optimizer = (
        rewriter_config_pb2.RewriterConfig.ON)
  if params.rewriter_config:
    rewriter_config = rewriter_config_pb2.RewriterConfig()
    text_format.Merge(params.rewriter_config, rewriter_config)
    config.graph_options.rewrite_options.CopyFrom(rewriter_config)
  if params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    config.gpu_options.visible_device_list = str(hvd.local_rank())

  return config

github lsds / KungFu / srcs / python / kungfu / tensorflow / v1 / benchmarks / __main__.py
def _config(method):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if method == 'HOROVOD':
        import horovod.tensorflow as hvd
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    else:
        config.gpu_options.visible_device_list = str(_get_cuda_index())
    return config
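
An alternative to visible_device_list, worth knowing when reading snippets like the one above, is to set CUDA_VISIBLE_DEVICES instead; this hides the other GPUs from the CUDA runtime entirely. A sketch, assuming it runs before TensorFlow has initialized any GPU device:

import os
import horovod.tensorflow as hvd

hvd.init()
# Must be set before TensorFlow touches the GPUs for the first time;
# afterwards this process can only see (and enumerate) its own device.
os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())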

github polyaxon / polyaxon / examples / in_cluster / horovod / tensorflow / mnist.py
    if hvd.rank() == 0:
        experiment.log_data_ref(data=train_data, data_name='x_train')
        experiment.log_data_ref(data=train_labels, data_name='y_train')
        experiment.log_data_ref(data=eval_data, data_name='x_test')
        experiment.log_data_ref(data=eval_labels, data_name='y_test')

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

github awslabs / sagemaker-debugger / examples / tensorflow / scripts / distributed_training / horovod_mnist_estimator.py
    # Download and load MNIST dataset.
    (train_data, train_labels), (eval_data, eval_labels) = keras.datasets.mnist.load_data(
        "/tmp/MNIST-data-%d" % hvd.rank()
    )

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = "./mnist_convnet_model" if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config),
    )

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=500)

github microsoft / DistributedDeepLearning / HorovodTF / src / imagenet_estimator_tf_horovod.py
def _get_runconfig(is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        return tf.estimator.RunConfig(save_checkpoints_steps=None,
                                      save_checkpoints_secs=None,
                                      session_config=config)
    else:
        return tf.estimator.RunConfig(save_checkpoints_steps=None)
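
All of the snippets on this page are TF 1.x code: tf.ConfigProto and tf.Session do not exist in TF 2.x. The same local-rank pinning is done there through tf.config, following the pattern Horovod's own TF2 examples use (a sketch, minus the training code):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# TF 2.x equivalent of visible_device_list: make only the physical GPU
# whose index matches this process's local rank visible to it.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

Whichever API pins the devices, the script is launched once per GPU, for example with horovodrun -np 4 python train.py on a single 4-GPU node; the launcher (horovodrun or mpirun) is what assigns each process its rank and local rank.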