How to use the tensorflowonspark.TFNode.start_cluster_server function in tensorflowonspark

To help you get started, we’ve selected a few tensorflowonspark examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github yahoo / TensorFlowOnSpark / examples / slim / eval_image_classifier.py View on Github external
'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average.'
      'If left as None, then moving averages are not used.')

  tf.app.flags.DEFINE_integer(
      'eval_image_size', None, 'Eval image size')

  FLAGS = tf.app.flags.FLAGS

  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #tf_global_step = slim.get_or_create_global_step()
    tf_global_step = tf.Variable(0, name="global_step")

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
github yahoo / TensorFlowOnSpark / tensorflowonspark / TFSparkNode.py View on Github external
def start_cluster_server(self, num_gpus=1, rdma=False):
    """Start the cluster server for this node by delegating to ``TFNode.start_cluster_server``.

    This is a convenience wrapper so callers holding this object can write
    ``obj.start_cluster_server(...)`` instead of calling the ``TFNode``
    module function themselves.

    Args:
      num_gpus: forwarded unchanged as the second positional argument (default 1).
      rdma: forwarded unchanged as the third positional argument (default False).

    Returns:
      Whatever ``TFNode.start_cluster_server`` returns, unmodified.
    """
    # This instance is passed as the first argument of the module-level function.
    result = TFNode.start_cluster_server(self, num_gpus, rdma)
    return result
github yahoo / TensorFlowOnSpark / examples / imagenet / inception / imagenet_distributed_train_pipeline.py View on Github external
from inception import inception_distributed_train
  from inception.imagenet_data import ImagenetData
  import tensorflow as tf

  # instantiate FLAGS on workers using argv from driver and add job_name and task_id
  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS.job_name = job_name
  FLAGS.task_id = task_index
  print("FLAGS:", FLAGS.__dict__['__flags'])

  # Get TF cluster and server instances
  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.task_id == 0:
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)
github yahoo / TensorFlowOnSpark / examples / cifar10 / cifar10_eval.py View on Github external
FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval',
                             """Directory where to write event logs.""")
  tf.app.flags.DEFINE_string('eval_data', 'test',
                             """Either 'test' or 'train_eval'.""")
  tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
                             """Directory where to read model checkpoints.""")
  tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                              """How often to run the eval.""")
  tf.app.flags.DEFINE_integer('num_examples', 10000,
                              """Number of examples to run.""")
  tf.app.flags.DEFINE_boolean('run_once', False,
                           """Whether to run eval only once.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  def eval_once(saver, summary_writer, top_k_op, summary_op):
    """Run Eval once.

    Args:
      saver: Saver.
      summary_writer: Summary writer.
      top_k_op: Top K op.
      summary_op: Summary op.
    """
    with tf.Session() as sess:
      ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
      if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Assuming model_checkpoint_path looks something like:
github yahoo / TensorFlowOnSpark / examples / mnist / tf / mnist_dist_pipeline.py View on Github external
def map_fun(args, ctx):
  num_workers = args.cluster_size if args.driver_ps_nodes else args.cluster_size - args.num_ps
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma')

  def _parse_tfr(example_proto):
    """Parse one serialized example into a normalized ``(image, label)`` pair.

    Args:
      example_proto: a serialized example carrying an int64 ``"image"``
        feature of length ``IMAGE_PIXELS * IMAGE_PIXELS`` and an int64
        ``"label"`` feature of length 10.

    Returns:
      A tuple ``(image, label)`` of float tensors; the image values are
      divided by 255.
    """
    # Single source of truth for the flattened image length, so the divisor
    # below cannot drift out of sync with the parsed feature shape.
    num_pixels = IMAGE_PIXELS * IMAGE_PIXELS
    feature_def = {"label": tf.FixedLenFeature(10, tf.int64),
                   "image": tf.FixedLenFeature(num_pixels, tf.int64)}
    features = tf.parse_single_example(example_proto, feature_def)
    # Element-wise divisor with the same length as the flattened image.
    norm = tf.constant(255, dtype=tf.float32, shape=(num_pixels,))
    image = tf.div(tf.to_float(features['image']), norm)
    label = tf.to_float(features['label'])
    return (image, label)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
      worker_device="/job:worker/task:%d" % task_index,
github yahoo / TensorFlowOnSpark / examples / mnist / keras / mnist_mlp.py View on Github external
import numpy
  import os
  import tensorflow as tf
  from tensorflow.python import keras
  from tensorflow.python.keras import backend as K
  from tensorflow.python.keras.datasets import mnist
  from tensorflow.python.keras.models import Sequential, load_model, save_model
  from tensorflow.python.keras.layers import Dense, Dropout
  from tensorflow.python.keras.optimizers import RMSprop
  from tensorflow.python.keras.callbacks import LambdaCallback, TensorBoard
  from tensorflow.python.saved_model import builder as saved_model_builder
  from tensorflow.python.saved_model import tag_constants
  from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
  from tensorflowonspark import TFNode

  cluster, server = TFNode.start_cluster_server(ctx)

  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":

    def generate_rdd_data(tf_feed, batch_size):
        print("generate_rdd_data invoked")
        while True:
            batch = tf_feed.next_batch(batch_size)
            imgs = []
            lbls = []
            for item in batch:
                imgs.append(item[0])
                lbls.append(item[1])
            images = numpy.array(imgs).astype('float32') / 255
            labels = numpy.array(lbls).astype('float32')
github yahoo / TensorFlowOnSpark / examples / mnist / streaming / mnist_dist.py View on Github external
worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  IMAGE_PIXELS = 28

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    """Split a batch of ``(image, label)`` items into two numpy arrays.

    Args:
      batch: iterable of indexable items where ``item[0]`` is the image data
        and ``item[1]`` is the label.

    Returns:
      A tuple ``(xs, ys)``: ``xs`` is the image data as float32 divided by
      255.0, ``ys`` is the label data as uint8.
    """
    # Materialize once so the batch is traversed a single time.
    rows = list(batch)
    pixel_rows = [row[0] for row in rows]
    label_rows = [row[1] for row in rows]
    xs = numpy.array(pixel_rows).astype(numpy.float32) / 255.0
    ys = numpy.array(label_rows).astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
github yahoo / TensorFlowOnSpark / examples / mnist / spark / mnist_dist_dataset.py View on Github external
from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Parameters
  IMAGE_PIXELS = 28
  hidden_units = 128

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  # Create generator for Spark data feed
  tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")

  def rdd_generator():
    """Yield ``(image, label)`` numpy pairs pulled one at a time from ``tf_feed``.

    Each record is read with ``tf_feed.next_batch(1)``. The image is converted
    to float32 and divided by 255.0; the label is converted to int64.
    Generation stops when ``tf_feed.should_stop()`` is true or when the feed
    hands back an empty batch.
    """
    while not tf_feed.should_stop():
      batch = tf_feed.next_batch(1)
      # The feed can return an empty batch (presumably once it has been
      # drained or terminated); indexing [0] on it would raise IndexError
      # inside the generator, so end the stream cleanly instead.
      if len(batch) == 0:
        return
      row = batch[0]
      image = numpy.array(row[0]).astype(numpy.float32) / 255.0
      label = numpy.array(row[1]).astype(numpy.int64)
      yield (image, label)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":