if step == n_step:
    break
tic = time.time()
if step != 0 and (step % lr_decay_every_step == 0):
    new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
    sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))
[_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
    sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])
# tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
lr = sess.run(lr_v)
print(
    'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'.format(
        hvd.rank(), step, n_step, _loss, lr, _l2,
        time.time() - tic))
for ix, ll in enumerate(_stage_losses):
    print('Worker{}:'.format(hvd.rank()), 'Network#', ix, 'For Branch', ix % 2 + 1, 'Loss:', ll)
# save intermediate results and model
if hvd.rank() == 0:  # Horovod: only the chief worker saves
    if (step != 0) and (step % save_interval == 0):
        # save some results
        [img_out, confs_ground, pafs_ground, conf_result, paf_result,
         mask_out] = sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
        draw_results(img_out, confs_ground, conf_result, pafs_ground, paf_result, mask_out,
                     'train_%d_' % step)
        # save model
        # tl.files.save_npz(
        #     net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
        # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
        tl.files.save_npz_dict(
            net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
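# The loop above assumes the usual single-process-per-GPU Horovod TF1 setup
# earlier in the script. A minimal sketch of that boilerplate follows; the names
# `base_lr` and the momentum value are illustrative assumptions, `scaled_lr`,
# `lr_v`, `total_loss` and `train_op` are chosen to match the snippet above.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()                                                       # one process per GPU
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())   # pin this process to its GPU

scaled_lr = base_lr * hvd.size()                                 # linear LR scaling with worker count
lr_v = tf.Variable(scaled_lr, trainable=False)
opt = hvd.DistributedOptimizer(tf.train.MomentumOptimizer(lr_v, 0.9))  # allreduce the gradients
train_op = opt.minimize(total_loss)

sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
sess.run(hvd.broadcast_global_variables(0))                      # start every rank from rank 0's weights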
if cfg.TRAINER == 'replicated':
    with ThreadPoolExecutor(max_workers=self.num_predictor) as executor, \
            tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
        futures = []
        for dataflow, pred in zip(self.dataflows, self.predictors):
            futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
        all_results = list(itertools.chain(*[fut.result() for fut in futures]))
else:
    if self._horovod_run_eval:
        local_results = eval_coco(self.dataflow, self.predictor)
        output_partial = os.path.join(
            logdir, 'outputs{}-part{}.json'.format(self.global_step, hvd.local_rank()))
        with open(output_partial, 'w') as f:
            json.dump(local_results, f)
    self.barrier.eval()
    if hvd.rank() > 0:
        return
    all_results = []
    for k in range(hvd.local_size()):
        output_partial = os.path.join(
            logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
        with open(output_partial, 'r') as f:
            obj = json.load(f)
        all_results.extend(obj)
        os.unlink(output_partial)

output_file = os.path.join(
    logdir, 'outputs{}.json'.format(self.global_step))
with open(output_file, 'w') as f:
    json.dump(all_results, f)
try:
    scores = print_evaluation_scores(output_file)
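# The `self.barrier` op evaluated above is not defined in this excerpt. A common
# way to build such a barrier in Horovod TF1 is a throwaway allreduce that every
# rank must reach before any rank can proceed; a hedged sketch under that
# assumption (not the original code):
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
# The allreduce only completes once every rank has launched it, so running this
# op synchronizes the workers before rank 0 merges the per-rank JSON parts.
barrier = hvd.allreduce(tf.random_normal(shape=[1]))
with tf.Session() as sess:
    barrier.eval()  # blocks until all workers arrive here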
def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + url + '. Can you get to it with a browser?')
    return filename


filename = maybe_download('text8-%d.zip' % hvd.rank(), 31344016)


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
def rank0log(logger, *args, **kwargs):
    if hvd.rank() == 0:
        if logger:
            logger.info(''.join([str(x) for x in list(args)]))
        else:
            print(*args, **kwargs)
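# A brief usage sketch for rank0log (the logger, `step` and `loss_value` are
# illustrative): only the rank-0 worker emits the line, so the message is not
# duplicated once per worker.
rank0log(logger, 'step ', step, ' loss ', float(loss_value))
rank0log(None, 'falls back to print() when no logger is given')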
def main(cfg):
    ######################################################################################
    # Create Training Data
    ######################################################################################
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [build_dataloader(datasets,
                                    cfg.batch_size_per_device,
                                    cfg.workers_per_gpu,
                                    num_gpus=hvd.size(),
                                    dist=True)]
    dali_tdf = dali.dali_dataset(cfg.data.train.dataset_dir,
                                 batch_size=cfg.batch_size_per_device,
                                 device_id=hvd.rank(), num_gpus=hvd.size())
    tf_datasets = [(dali_tdf, tf_datasets[0][1])]
    ######################################################################################
    # Build Model
    ######################################################################################
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass an example batch through so tensor shapes are defined
    _ = model(next(iter(tf_datasets[0][0])), use_dali=True)
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)
    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model, batch_processor, name=cfg.model_name,
                                     optimizer=cfg.optimizer, work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level), amp_enabled=cfg.fp16,
def print_act_stats(x, _str=""):
if not do_print_act_stats:
return x
if hvd.rank() != 0:
return x
if len(x.get_shape()) == 1:
x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
if len(x.get_shape()) == 2:
x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
if len(x.get_shape()) == 4:
x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)), tf.reduce_max(tf.sqrt(x_var))]
return tf.Print(x, stats, "["+_str+"] "+x.name)
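# Hedged usage sketch for print_act_stats: wrap an activation tensor during graph
# construction so its statistics print whenever it is evaluated on rank 0. The
# layer name and shape below are assumptions; tf.Print is a TF1-only op.
h = tf.layers.conv2d(x, 64, 3, padding='same', name='conv1')
h = print_act_stats(h, "conv1")  # identity on the data; logs mean/std stats on rank 0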
def _rank(method):
    if method == 'HOROVOD':
        import horovod.tensorflow as hvd
        return hvd.rank()
    else:
        return current_rank()
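# Hypothetical usage of _rank; `current_rank()` is assumed to be provided by the
# surrounding framework and is not defined in this excerpt.
log_path = 'train_rank{}.log'.format(_rank('HOROVOD'))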
def train_fn(op, loss):
    i = 0
    total_loss = 0
    cnt = 0
    while True:
        try:
            [_, train_loss] = sess.run([op, loss])
            i += 1
            cnt += 1
            total_loss += train_loss
            # print("==device id {} global step {}".format(hvd.rank(), step))
            if np.mod(i, num_storage_steps) == 0:
                print(total_loss / cnt)
                if hvd.rank() == 0:
                    model_io_fn.save_model(sess, FLAGS.model_output + "/oqmrc_{}.ckpt".format(int(i / num_storage_steps)))
                cnt = 0
                total_loss = 0
        except tf.errors.OutOfRangeError:
            print("End of dataset")
            break
import time
    label_element = nest.map_structure(
        lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1])
    )
    element = (input_element, label_element)
    ds = tf.data.Dataset.from_tensors(element).repeat()
else:
    shuffle_buffer_size = 10000
    num_readers = 1
    if hvd.size() > len(filenames):
        assert (hvd.size() % len(filenames)) == 0
        filenames = filenames * (hvd.size() // len(filenames))  # repeat the list an integer number of times
    ds = tf.data.Dataset.from_tensor_slices(filenames)
    if shard:
        # split the dataset into parts for each GPU
        ds = ds.shard(hvd.size(), hvd.rank())
    if not training:
        # make sure all ranks have the same amount
        ds = ds.take(take_count)
    if training:
        ds = ds.shuffle(1000, seed=7 * (1 + hvd.rank()))
    ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
    counter = tf.data.Dataset.range(sys.maxsize)
    ds = tf.data.Dataset.zip((ds, counter))
    preproc_func = lambda record, counter_: parse_and_preprocess_image_record(
        record,
        counter_,
        height,
        width,