tf.app.flags.DEFINE_string(
    'preprocessing_name', None, 'The name of the preprocessing to use. If left '
    'as `None`, then the model_name flag is used.')

tf.app.flags.DEFINE_float(
    'moving_average_decay', None,
    'The decay to use for the moving average. '
    'If left as None, then moving averages are not used.')

tf.app.flags.DEFINE_integer(
    'eval_image_size', None, 'Eval image size')

FLAGS = tf.app.flags.FLAGS

if not FLAGS.dataset_dir:
  raise ValueError('You must supply the dataset directory with --dataset_dir')

cluster_spec, server = TFNode.start_cluster_server(ctx)

tf.logging.set_verbosity(tf.logging.INFO)
with tf.Graph().as_default():
  # tf_global_step = slim.get_or_create_global_step()
  tf_global_step = tf.Variable(0, name="global_step")

  ######################
  # Select the dataset #
  ######################
  dataset = dataset_factory.get_dataset(
      FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

  ####################
  # Select the model #
  ####################
  network_fn = nets_factory.get_network_fn(
      FLAGS.model_name,
      num_classes=(dataset.num_classes - FLAGS.labels_offset),
      is_training=False)
def start_cluster_server(self, num_gpus=1, rdma=False):
  """Convenience function to access ``TFNode.start_cluster_server`` directly from this object instance."""
  return TFNode.start_cluster_server(self, num_gpus, rdma)
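
# A minimal sketch of calling this convenience method from a TensorFlowOnSpark
# map function (the map_fun signature and ctx object follow the examples below;
# everything else here is illustrative, not taken from the snippet above):
def map_fun(args, ctx):
  # ctx is the node context handed to each Spark executor; this call delegates
  # to TFNode.start_cluster_server(ctx, num_gpus, rdma).
  cluster, server = ctx.start_cluster_server(num_gpus=1, rdma=False)
  if ctx.job_name == "ps":
    server.join()  # parameter servers block here, serving variables
  else:
    pass  # workers would build and run their graph against server.target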
import sys

from inception import inception_distributed_train
from inception.imagenet_data import ImagenetData
import tensorflow as tf

# instantiate FLAGS on workers using argv from driver and add job_name and task_id
print("argv:", argv)
sys.argv = argv

FLAGS = tf.app.flags.FLAGS
FLAGS.job_name = job_name
FLAGS.task_id = task_index
print("FLAGS:", FLAGS.__dict__['__flags'])

# Get TF cluster and server instances
cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

if FLAGS.job_name == 'ps':
  # `ps` jobs wait for incoming connections from the workers.
  server.join()
else:
  # `worker` jobs will actually do the work.
  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()
  # Only the chief checks for or creates train_dir.
  if FLAGS.task_id == 0:
    if not tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.MakeDirs(FLAGS.train_dir)
  inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)
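
# For orientation, a hedged sketch of how a worker function like the one above
# is typically launched from the Spark driver via TFCluster; the executor and
# ps counts below are placeholders, not values from this snippet:
from tensorflowonspark import TFCluster

num_executors = 4  # hypothetical: total Spark executors
num_ps = 1         # hypothetical: executors reserved as parameter servers
# sc is the driver's SparkContext; map_fun runs once per executor.
cluster = TFCluster.run(sc, map_fun, sys.argv, num_executors, num_ps,
                        tensorboard=False, input_mode=TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()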
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval',
                           """Directory where to write event logs.""")
tf.app.flags.DEFINE_string('eval_data', 'test',
                           """Either 'test' or 'train_eval'.""")
tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
                           """Directory where to read model checkpoints.""")
tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                            """How often to run the eval.""")
tf.app.flags.DEFINE_integer('num_examples', 10000,
                            """Number of examples to run.""")
tf.app.flags.DEFINE_boolean('run_once', False,
                            """Whether to run eval only once.""")
tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

def eval_once(saver, summary_writer, top_k_op, summary_op):
  """Run Eval once.

  Args:
    saver: Saver.
    summary_writer: Summary writer.
    top_k_op: Top K op.
    summary_op: Summary op.
  """
  with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
      # Restores from checkpoint
      saver.restore(sess, ckpt.model_checkpoint_path)
      # Assuming model_checkpoint_path looks something like:
      #   /my-favorite-path/cifar10_train/model.ckpt-0,
      # extract global_step from it.
      global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
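
# The run_once and eval_interval_secs flags above drive an outer loop around
# eval_once; a minimal sketch of that loop, modeled on the standard CIFAR-10
# eval script (this evaluate wrapper is an assumption, not shown above):
import time

def evaluate(saver, summary_writer, top_k_op, summary_op):
  while True:
    eval_once(saver, summary_writer, top_k_op, summary_op)
    if FLAGS.run_once:
      break
    time.sleep(FLAGS.eval_interval_secs)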
def map_fun(args, ctx):
  num_workers = args.cluster_size if args.driver_ps_nodes else args.cluster_size - args.num_ps
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma')

  def _parse_tfr(example_proto):
    feature_def = {"label": tf.FixedLenFeature(10, tf.int64),
                   "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)}
    features = tf.parse_single_example(example_proto, feature_def)
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    label = tf.to_float(features['label'])
    return (image, label)
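
  # A parser like _parse_tfr is normally consumed through tf.data; a minimal
  # sketch using the TF 1.x API (the TFRecord path below is a placeholder, and
  # args.batch_size is assumed to exist on the args namespace):
  ds = tf.data.TFRecordDataset(["/path/to/mnist/train.tfrecords"])
  ds = ds.map(_parse_tfr).batch(args.batch_size)
  iterator = ds.make_one_shot_iterator()
  images, labels = iterator.get_next()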
  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):
import numpy
import os
import tensorflow as tf
from tensorflow.python import keras
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.datasets import mnist
from tensorflow.python.keras.models import Sequential, load_model, save_model
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import LambdaCallback, TensorBoard
from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
from tensorflowonspark import TFNode
cluster, server = TFNode.start_cluster_server(ctx)

if ctx.job_name == "ps":
  server.join()
elif ctx.job_name == "worker":

  def generate_rdd_data(tf_feed, batch_size):
    print("generate_rdd_data invoked")
    while True:
      batch = tf_feed.next_batch(batch_size)
      imgs = []
      lbls = []
      for item in batch:
        imgs.append(item[0])
        lbls.append(item[1])
      images = numpy.array(imgs).astype('float32') / 255
      labels = numpy.array(lbls).astype('float32')
      yield (images, labels)
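
  # A generator like this is what Keras consumes; a minimal sketch, assuming a
  # compiled Sequential model named `model` and a DataFeed over the executor's
  # data partition (neither is shown above, and the step counts are illustrative):
  tf_feed = TFNode.DataFeed(ctx.mgr)
  model.fit_generator(
      generator=generate_rdd_data(tf_feed, batch_size=100),
      steps_per_epoch=100,
      epochs=1)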
worker_num = ctx.worker_num
job_name = ctx.job_name
task_index = ctx.task_index

IMAGE_PIXELS = 28

# Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
if job_name == "ps":
  time.sleep((worker_num + 1) * 5)

# Parameters
hidden_units = 128
batch_size = args.batch_size

# Get TF cluster and server instances
cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

def feed_dict(batch):
  # Convert from [(images, labels)] to two numpy arrays of the proper type
  images = []
  labels = []
  for item in batch:
    images.append(item[0])
    labels.append(item[1])
  xs = numpy.array(images)
  xs = xs.astype(numpy.float32)
  xs = xs / 255.0
  ys = numpy.array(labels)
  ys = ys.astype(numpy.uint8)
  return (xs, ys)
if job_name == "ps":
  server.join()
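
# Inside the worker branch, feed_dict pairs with a TFNode.DataFeed; a minimal
# sketch (the placeholders x and y_ and the train_op are assumptions, modeled
# on the standard TensorFlowOnSpark MNIST example):
tf_feed = TFNode.DataFeed(ctx.mgr)
with tf.Session(server.target) as sess:
  while not tf_feed.should_stop():
    batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
    if len(batch_xs) > 0:
      sess.run(train_op, feed_dict={x: batch_xs, y_: batch_ys})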
from tensorflowonspark import TFNode
from datetime import datetime
import math
import numpy
import tensorflow as tf
worker_num = ctx.worker_num
job_name = ctx.job_name
task_index = ctx.task_index
# Parameters
IMAGE_PIXELS = 28
hidden_units = 128
# Get TF cluster and server instances
cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)
# Create generator for Spark data feed
tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
def rdd_generator():
  while not tf_feed.should_stop():
    batch = tf_feed.next_batch(1)[0]
    image = numpy.array(batch[0])
    image = image.astype(numpy.float32) / 255.0
    label = numpy.array(batch[1])
    label = label.astype(numpy.int64)
    yield (image, label)

if job_name == "ps":
  server.join()
elif job_name == "worker":
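  # In the worker branch, rdd_generator is typically wrapped in tf.data; a
  # minimal sketch using the TF 1.x API (the output shapes assume flattened
  # 28x28 images and 10-class label vectors, which this snippet doesn't show):
  ds = tf.data.Dataset.from_generator(
      rdd_generator,
      (tf.float32, tf.int64),
      (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10])))
  ds = ds.batch(100)  # illustrative batch size
  iterator = ds.make_one_shot_iterator()
  image_batch, label_batch = iterator.get_next()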