def main(_):
    hvd.init()
    FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(FLAGS.output_dir, str(hvd.rank()))
    FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
    FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()
    tf.logging.set_verbosity(tf.logging.INFO)
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tf.gfile.MakeDirs(FLAGS.output_dir)
    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))
    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info(" %s" % input_file)
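# Illustrative sketch (not part of the original script) of how the per-rank flags
# set above are typically consumed by a TF1 Estimator; `model_fn` and
# `train_input_fn` are assumed names, only the Horovod-specific wiring is shown.
session_config = tf.ConfigProto()
session_config.gpu_options.visible_device_list = str(hvd.local_rank())  # one GPU per process

run_config = tf.estimator.RunConfig(
    model_dir=FLAGS.output_dir,           # per-rank output dir, set above
    session_config=session_config)

estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
estimator.train(
    input_fn=train_input_fn,
    max_steps=FLAGS.num_train_steps,      # already divided by hvd.size() above
    hooks=[hvd.BroadcastGlobalVariablesHook(0)])  # sync initial weights from rank 0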
If `cache_dir` and `load_model_from_dir` are the same and
`overwrite_model` is `False`, the fitted model is saved
to "cache_dir/fine_tuned". Defaults to False.
overwrite_model (bool, optional): Whether to overwrite an existing model.
If `cache_dir` and `load_model_from_dir` are the same and
`overwrite_model` is `False`, the fitted model is saved to
"cache_dir/fine_tuned". Defaults to False.
"""
# tb_writer = SummaryWriter()
# device = get_device("cpu" if num_gpus == 0 or not torch.cuda.is_available() else "gpu")
# self.model = move_to_device(self.model, device, num_gpus)
# hvd.init()
rank = hvd.rank()
local_rank = hvd.local_rank()
world_size = hvd.size()
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
is_master = rank == 0
self.cache_dir = self.cache_dir + "/distributed_" + str(local_rank)
self.model = self.model.to(device)
# t_total = len(train_dataloader) * num_epochs
# t_total = len(train_dataloader) // gradient_accumulation_steps * num_epochs
max_steps = 48000
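# Illustrative continuation (not the original fit() body): wrapping the optimizer
# with Horovod so gradients are averaged across processes; `learning_rate` and
# `gradient_accumulation_steps` are assumed names.
import horovod.torch as hvd

optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate * world_size)
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=self.model.named_parameters(),
    backward_passes_per_step=gradient_accumulation_steps)

# Start every worker from identical weights and optimizer state (rank 0 is the source).
hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)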
def __init__(self, accumulation_step=1):
    hvd.init()
    self.local_rank = hvd.local_rank()
    self.world_size = hvd.size()
    self.rank = hvd.rank()
    self.n_gpu = torch.cuda.device_count()
    self.node_count = self.world_size // self.n_gpu
    self.accumulation_step = accumulation_step
    self.count_down = accumulation_step - 1
    self._multi_node = self.node_count > 1
    if not self._multi_node:
        # use PyTorch's built-in NCCL backend for single-node training
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='tcp://127.0.0.1:6000',
            world_size=self.n_gpu,
            rank=self.local_rank)
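# Illustrative usage (the class name `DistributedCommunicator` and the names
# `batches`, `compute_loss`, `optimizer` are assumed): count_down gates how often
# the accumulated gradients are actually applied.
comm = DistributedCommunicator(accumulation_step=4)
for batch in batches:
    loss = compute_loss(batch) / comm.accumulation_step
    loss.backward()
    if comm.count_down == 0:
        optimizer.step()        # apply gradients accumulated over accumulation_step batches
        optimizer.zero_grad()
        comm.count_down = comm.accumulation_step - 1
    else:
        comm.count_down -= 1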
Options loaded from default.py will be overridden by options loaded from the cfg file.
Options passed in through the options argument will override options loaded from the cfg file.
Args:
    *options (str, int, optional): Options used to override what is loaded from the config.
        To see what options are available consult default.py.
    cfg (str, optional): Location of config file to load. Defaults to None.
"""
update_config(config, options=options, config_file=cfg)
hvd.init()
silence_other_ranks = True
logging.config.fileConfig(config.LOG_CONFIG)
logger = logging.getLogger(__name__)
torch.manual_seed(config.SEED)
torch.cuda.set_device(hvd.local_rank())
torch.cuda.manual_seed(config.SEED)
rank, world_size = hvd.rank(), hvd.size()
scheduler_step = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK
torch.manual_seed(config.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(config.SEED)
np.random.seed(seed=config.SEED)
# Setup Augmentations
basic_aug = Compose(
    [
        Normalize(
            mean=(config.TRAIN.MEAN,),
            std=(config.TRAIN.STD,),
            max_pixel_value=1,
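# The Compose(...) call above is cut off by the snippet boundary; a self-contained
# version of the same normalization-only pipeline (albumentations-style API,
# `section` is a placeholder array) would look roughly like this:
from albumentations import Compose, Normalize

basic_aug = Compose(
    [
        Normalize(
            mean=(config.TRAIN.MEAN,),
            std=(config.TRAIN.STD,),
            max_pixel_value=1,
        )
    ]
)
augmented_section = basic_aug(image=section)["image"]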
filename_pattern = os.path.join(data_dir, '%s-*')
eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
num_eval_samples = _get_num_records(eval_filenames)
if step == n_step:
    break
tic = time.time()
if step != 0 and (step % lr_decay_every_step == 0):
    new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
    sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))
[_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
    sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])
# tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
lr = sess.run(lr_v)
print(
    'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'.format(
        hvd.rank(), step, n_step, _loss, lr, _l2,
        time.time() - tic))
for ix, ll in enumerate(_stage_losses):
    print('Worker{}:'.format(hvd.rank()), 'Network#', ix, 'For Branch', ix % 2 + 1, 'Loss:', ll)
# save intermediate results and model
if hvd.rank() == 0:  # Horovod
    if (step != 0) and (step % save_interval == 0):
        # save some results
        [img_out, confs_ground, pafs_ground, conf_result, paf_result,
         mask_out] = sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
        draw_results(img_out, confs_ground, conf_result, pafs_ground, paf_result, mask_out,
                     'train_%d_' % step)
        # save model
        # tl.files.save_npz(
        #     net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
        # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
        tl.files.save_npz_dict(
            net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
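# Sketch of the Horovod wiring that a session-based loop like the one above relies
# on (illustrative, not from the original script); `base_lr` and `total_loss` are
# assumed names, the optimizer choice is generic.
hvd.init()
scaled_lr = base_lr * hvd.size()                    # scale the learning rate with worker count
lr_v = tf.Variable(scaled_lr, trainable=False)
opt = hvd.DistributedOptimizer(tf.train.MomentumOptimizer(lr_v, 0.9))
train_op = opt.minimize(total_loss)

config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())  # one GPU per process
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
sess.run(hvd.broadcast_global_variables(0))         # all workers start from rank 0's weights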
    nstep = num_training_samples * num_iter // global_batch_size
    decay_steps = nstep
else:
    nstep = num_iter
    num_epochs = max(nstep * global_batch_size // num_training_samples, 1)
    decay_steps = 90 * num_training_samples // global_batch_size
nstep_per_epoch = num_training_samples // global_batch_size
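# Worked example with assumed numbers (ImageNet-scale run on 8 GPUs), just to make
# the step arithmetic above concrete:
#   num_training_samples = 1281167, per-GPU batch = 256, hvd.size() = 8
#   global_batch_size    = 256 * 8 = 2048
#   num_iter = 90 (epochs)  ->  nstep = 1281167 * 90 // 2048 = 56301 steps
#   decay_steps = nstep, so the learning-rate schedule spans the whole run
#   nstep_per_epoch = 1281167 // 2048 = 625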
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
config.gpu_options.force_gpu_compatible = True # Force pinned memory
config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
config.inter_op_parallelism_threads = max(2, 40//hvd.size()-2)
classifier = tf.estimator.Estimator(
    model_fn=_cnn_model_function,
    model_dir=log_dir,
    params={
        'model': infer_func,
        'format': image_format,
        'dtype': tf.float16 if precision == 'fp16' else tf.float32,
        'momentum': momentum,
        'learning_rate_init': learning_rate_init,
        'learning_rate_power': learning_rate_power,
        'decay_steps': decay_steps,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'larc_eta': larc_eta,
        'larc_mode': larc_mode,
# Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
# rank 0 to all other processes. This is necessary to ensure consistent
# initialization of all workers when training is started with random weights or
# restored from a checkpoint.
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
# Train the model
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": train_data},
    y=train_labels,
    batch_size=100,
    num_epochs=None,
    shuffle=True)
# Horovod: adjust number of steps based on number of GPUs.
mnist_classifier.train(
    input_fn=train_input_fn,
    steps=3000 // hvd.size(),
    hooks=[logging_hook, bcast_hook])
# Evaluate the model and print results
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": eval_data},
    y=eval_labels,
    num_epochs=1,
    shuffle=False)
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)
# Polyaxon
if hvd.rank() == 0:
    experiment.log_metrics(**eval_results)
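# Quick arithmetic for the `steps=3000 // hvd.size()` adjustment above (worker
# count assumed): with hvd.size() == 4 each worker runs 750 steps of batch 100,
# so the job as a whole still sees roughly 4 * 750 * 100 = 300,000 examples.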
eval_idx_filenames = None
if data_idx_dir is not None:
    filename_pattern = os.path.join(data_idx_dir, '%s-*')
    eval_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
else:
    raise ValueError("data_idx_dir must be specified")
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
config.gpu_options.force_gpu_compatible = True # Force pinned memory
config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)  # keep at least 2 threads
classifier_eval = tf.estimator.Estimator(
    model_fn=_cnn_model_function,
    model_dir=log_dir,
    params={
        'model': infer_func,
        'format': image_format,
        'dtype': tf.float16 if precision == 'fp16' else tf.float32,
        'momentum': momentum,
        'learning_rate_init': learning_rate_init,
        'learning_rate_power': learning_rate_power,
        'decay_steps': None,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'larc_eta': larc_eta,
        'larc_mode': larc_mode,