filename_pattern = os.path.join(data_dir, '%s-*')
eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
num_eval_samples = _get_num_records(eval_filenames)

eval_idx_filenames = None
if data_idx_dir is not None:
    filename_pattern = os.path.join(data_idx_dir, '%s-*')
    eval_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
else:
    raise ValueError("data_idx_dir must be specified")
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
config.gpu_options.force_gpu_compatible = True # Force pinned memory
config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
config.inter_op_parallelism_threads = 40 // hvd.size() - 2
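# Note (not part of the original excerpt): hvd.init() must have been called
# earlier in the program for hvd.local_rank() / hvd.size() above to be valid,
# e.g.
#   import horovod.tensorflow as hvd
#   hvd.init()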
classifier_eval = tf.estimator.Estimator(
    model_fn=_cnn_model_function,
    model_dir=log_dir,
    params={
        'model': infer_func,
        'format': image_format,
        'dtype': tf.float16 if precision == 'fp16' else tf.float32,
        'momentum': momentum,
        'learning_rate_init': learning_rate_init,
        'learning_rate_power': learning_rate_power,
        'decay_steps': None,
        'weight_decay': weight_decay,
# Avoid a race condition among the workers that share the same filesystem: if
# the directory already exists by the time this worker gets around to creating
# it, ignore the resulting exception and continue.
cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
if not os.path.exists(cache_dir):
    try:
        os.mkdir(cache_dir)
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
            pass
        else:
            raise

# Download and load MNIST dataset.
(train_data, train_labels), (eval_data, eval_labels) = \
    keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())
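# Using a per-rank file name ('MNIST-data-%d' % hvd.rank()) keeps each worker's
# download separate, so concurrent ranks do not clobber one another's cache file.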
# Polyaxon
if hvd.rank() == 0:
    experiment.log_data_ref(data=train_data, data_name='x_train')
    experiment.log_data_ref(data=train_labels, data_name='y_train')
    experiment.log_data_ref(data=eval_data, data_name='x_test')
    experiment.log_data_ref(data=eval_labels, data_name='y_test')

# The downloaded data has shape (-1, 28, 28), so reshape it into (-1, 784) to
# feed into our network, and normalize the features to the range [0, 1].
train_data = np.reshape(train_data, (-1, 784)) / 255.0
eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
if hvd.rank() < remainder:
    start_index = hvd.rank() * (num_examples_per_rank + 1)
    end_index = start_index + num_examples_per_rank + 1
else:
    start_index = hvd.rank() * num_examples_per_rank + remainder
    end_index = start_index + num_examples_per_rank
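# A hypothetical standalone helper (not from the original source) that captures
# the per-rank sharding arithmetic above: the first `remainder` ranks each take
# one extra example.
def shard_bounds(num_examples, rank, size):
    """Return [start, end) indices of the shard owned by `rank`."""
    num_examples_per_rank, remainder = divmod(num_examples, size)
    if rank < remainder:
        start = rank * (num_examples_per_rank + 1)
        end = start + num_examples_per_rank + 1
    else:
        start = rank * num_examples_per_rank + remainder
        end = start + num_examples_per_rank
    return start, end

# Example: 10 examples over 4 ranks -> (0, 3), (3, 6), (6, 8), (8, 10)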
# Horovod: scale the learning rate by the number of workers when training is
# distributed.
model_fn = model_fn_builder(
    bert_config=bert_config,
    num_labels=len(label_list),
    init_checkpoint=FLAGS.init_checkpoint,
    learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_one_hot_embeddings=False,
    hvd=None if not FLAGS.horovod else hvd)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config)

if FLAGS.do_train:
    # Each rank converts only its own shard of the training examples.
    file_based_convert_examples_to_features(
        train_examples[start_index:end_index], label_list,
        FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=tmp_filenames,
        num_batches_per_epoch=params["steps_per_epoch"],
        num_gpus=params["num_gpus"]
    )
tf.identity(learning_rate, name='learning_rate_ref')
tf.summary.scalar('learning_rate', learning_rate)

optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"])

if params["apply_loss_scaling"]:
    optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])

# Horovod: wrap the optimizer so gradients are averaged across all ranks
# before being applied.
if hvd_utils.is_using_hvd():
    optimizer = hvd.DistributedOptimizer(optimizer)

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if mode != tf.estimator.ModeKeys.TRAIN:
    update_ops += [acc_top1_update_op, acc_top5_update_op]

deterministic = True
gate_gradients = (tf.train.Optimizer.GATE_OP if deterministic else tf.train.Optimizer.GATE_NONE)

backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)

if self.model_hparams.use_dali:
    train_ops = tf.group(backprop_op, update_ops, name='train_ops')
else:
def _eval(self):
    logdir = self._output_dir
    if cfg.TRAINER == 'replicated':
        all_results = multithread_predict_dataflow(self.dataflows, self.predictors)
    else:
        # Horovod: each local rank writes its own shard of results to disk,
        # then rank 0 merges the shards after a barrier.
        filenames = [os.path.join(
            logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
        ) for rank in range(hvd.local_size())]

        if self._horovod_run_eval:
            local_results = predict_dataflow(self.dataflow, self.predictor)
            fname = filenames[hvd.local_rank()]
            with open(fname, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for fname in filenames:
            with open(fname, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(fname)

    output_file = os.path.join(
        logdir, '{}-outputs{}.json'.format(self._eval_dataset, self.global_step))
    scores = DetectionDataset().eval_or_save_inference_results(
        all_results, self._eval_dataset, output_file)
    for k, v in scores.items():
        self.trainer.monitors.put_scalar(k, v)
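# The self.barrier op above synchronizes the ranks before rank 0 reads the
# shard files. Its definition is outside this excerpt; in Horovod a collective
# op such as an allreduce is commonly used as such a barrier. A minimal sketch
# of that idea (assumed, not the original code):
import tensorflow as tf
import horovod.tensorflow as hvd

# Evaluating this op blocks until every rank has reached it.
barrier = hvd.allreduce(tf.constant(0.0))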
cmdline = add_cli_args()
FLAGS, unknown_args = cmdline.parse_known_args()

ds = create_data(FLAGS.data_dir, FLAGS.synthetic, FLAGS.batch_size)

model = tf.keras.applications.ResNet50(weights=None, classes=1000)
opt = tf.keras.optimizers.SGD(learning_rate=FLAGS.learning_rate * hvd.size(), momentum=0.1)
loss_func = tf.keras.losses.SparseCategoricalCrossentropy()

loop_time = time()
if hvd.local_rank() == 0:
    print("Step \t Throughput \t Loss")
for batch, (images, labels) in enumerate(ds):
    loss = train_step(model, opt, loss_func, images, labels, batch == 0)
    if hvd.local_rank() == 0:
        duration = time() - loop_time
        loop_time = time()
        throughput = (hvd.size() * FLAGS.batch_size) / duration
        print("{} \t images/sec: {} \t {}".format(batch, throughput, loss))
    if batch == FLAGS.num_batches:
        break
if hvd.rank() == 0:
    print("\nFinished in {}".format(time() - start))
"""
This function returns the name of the worker based on
the distribution strategy.
We do not use this function for MirroredStrategy.
Device names are used as worker names for this MirroredStrategy.
The names of the workers are managed by device_map in the case of this strategy.
It is safe to return the TORNASOLE_CONFIG_DEFAULT_WORKER_NAME in this case.
:return: str
"""
try:
import horovod.tensorflow as hvd
if hvd.size():
return f"worker_{hvd.rank()}"
except (ModuleNotFoundError, ValueError, ImportError):
pass
tf_config = os.getenv("TF_CONFIG")
if tf_config and is_parameter_server_strategy(tf_config):
return get_worker_id_from_tf_config(tf_config)
return CONFIG_DEFAULT_WORKER_NAME
def session_config():
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L1,
                                            do_function_inlining=True)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options)
    config = tf.ConfigProto(allow_soft_placement=True,
                            graph_options=graph_options)
    # Horovod: pin each process to a single GPU, selected by local rank.
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    return config
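# One possible way to use session_config() with Horovod (illustrative only;
# the surrounding training code is not part of this excerpt):
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
with tf.train.MonitoredTrainingSession(config=session_config(), hooks=hooks) as sess:
    pass  # run training ops here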
        'lc_beta': FLAGS.lc_beta,
        'loss_scale': FLAGS.loss_scale,
        'adv_bn_init': FLAGS.adv_bn_init,
        'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None
    },
    config=tf.estimator.RunConfig(
        # tf_random_seed=31 * (1 + hvd.rank()),
        session_config=config,
        save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None,
        keep_checkpoint_max=None))

if not FLAGS.eval:
    num_preproc_threads = FLAGS.num_parallel_calls
    rank0log(logger, "Using preprocessing threads per GPU: ", num_preproc_threads)

    # Horovod: broadcast initial variable state from rank 0 to all other
    # processes so that every worker starts from the same weights.
    training_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                      PrefillStagingAreasHook()]
    if hvd.rank() == 0:
        training_hooks.append(
            LogSessionRunHook(global_batch_size,
                              num_training_samples,
                              FLAGS.display_every, logger))

    try:
        start_time = time.time()
        classifier.train(
            input_fn=lambda: make_dataset(
                train_filenames,
                training_samples_per_rank,
                FLAGS.batch_size, height, width,
                FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue,
                training=True, num_threads=num_preproc_threads,
                shard=True, synthetic=FLAGS.synthetic, increased_aug=FLAGS.increased_aug),