# [extraction artifact] Web-page banner captured during scraping, not source code:
# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# --- Fragment (indentation lost in extraction): tail of a training-step
# routine — linear LR warmup then linear decay, forward/backward on a tagging
# loss, global-norm clipping, one optimizer step, and logging. The enclosing
# function signature is outside this excerpt; it ends by returning step_num.
# step size adjustments
step_num += 1
if step_num < num_warmup_steps:
# Warmup phase: LR ramps linearly from 0 up to config.learning_rate.
new_lr = config.learning_rate * step_num / num_warmup_steps
else:
# Decay phase: LR falls linearly, reaching 0 at num_train_steps.
offset = ((step_num - num_warmup_steps) * config.learning_rate /
(num_train_steps - num_warmup_steps))
new_lr = config.learning_rate - offset
trainer.set_learning_rate(new_lr)
with mx.autograd.record():
# flag_nonnull_tag appears to mask null/padding positions out of the
# loss (expanded to broadcast over the tag axis) — presumably; confirm
# against the loss_function definition.
loss_value = loss_function(out, tag_ids,
flag_nonnull_tag.expand_dims(axis=2)).mean()
loss_value.backward()
# Clip the global gradient norm to 1 before the optimizer step.
nlp.utils.clip_grad_global_norm(params, 1)
trainer.step(1)
pred_tags = out.argmax(axis=-1)
logging.info('loss_value: %6f', loss_value.asscalar())
# Accuracy is computed only over non-null tag positions.
num_tag_preds = flag_nonnull_tag.sum().asscalar()
logging.info(
'accuracy: %6f', (((pred_tags == tag_ids) * flag_nonnull_tag).sum().asscalar()
/ num_tag_preds))
return step_num
# --- Fragment: mid-loop of a distributed (Horovod-capable) training loop with
# optional fp16 AMP and gradient accumulation. The next line is the tail of a
# multi-line loss expression whose opening lines are outside this excerpt.
]).sum() / num_labels
if accumulate:
# Scale the loss so accumulated gradients match one large-batch step.
loss = loss / accumulate
if args.dtype == 'float16':
# AMP scales the loss before backward; the clip threshold is scaled by
# the same loss scale so clipping applies to the true gradient norm.
with amp.scale_loss(loss, trainer) as l:
mx.autograd.backward(l)
norm_clip = 1.0 * size * trainer._amp_loss_scaler.loss_scale
else:
mx.autograd.backward(loss)
norm_clip = 1.0 * size
# update
if not accumulate or (batch_id + 1) % accumulate == 0:
# Only step the optimizer every `accumulate` micro-batches.
trainer.allreduce_grads()
nlp.utils.clip_grad_global_norm(params, norm_clip)
trainer.update(1)
if accumulate:
param_dict.zero_grad()
if args.comm_backend == 'horovod':
# Average the reported loss across workers for logging only.
step_loss += hvd.allreduce(loss, average=True).asscalar()
else:
step_loss += loss.asscalar()
if (batch_id + 1) % log_interval == 0:
toc = time.time()
# NOTE(review): 'Thoughput' typo is preserved from the original string;
# batch_id is wrapped modulo the dataloader length for display.
log.info('Batch: {}/{}, Loss={:.4f}, lr={:.7f} '
'Thoughput={:.2f} samples/s'
.format(batch_id % len(train_dataloader),
len(train_dataloader), step_loss / log_interval,
trainer.learning_rate, log_num/(toc - tic)))
# --- Fragment: SQuAD-style span-prediction training-loop body (start/end
# position labels) followed by end-of-epoch throughput logging; the enclosing
# batch loop and epoch loop are outside this excerpt.
out = net(inputs.astype('float32').as_in_context(ctx),
token_types.astype('float32').as_in_context(ctx),
valid_length.astype('float32').as_in_context(ctx))
ls = loss_function(out, [
start_label.astype('float32').as_in_context(ctx),
end_label.astype('float32').as_in_context(ctx)]).mean()
if accumulate:
# Scale the loss for gradient accumulation.
ls = ls / accumulate
ls.backward()
# update
if not accumulate or (batch_id + 1) % accumulate == 0:
# Optimizer step only every `accumulate` micro-batches.
trainer.allreduce_grads()
nlp.utils.clip_grad_global_norm(params, 1)
trainer.update(1)
step_loss += ls.asscalar()
if (batch_id + 1) % log_interval == 0:
toc = time.time()
log.info('Epoch: {}, Batch: {}/{}, Loss={:.4f}, lr={:.7f} Time cost={:.1f} Thoughput={:.2f} samples/s' # pylint: disable=line-too-long
.format(epoch_id, batch_id, len(train_dataloader),
step_loss / log_interval,
trainer.learning_rate, toc - tic, log_num/(toc - tic)))
tic = time.time()
step_loss = 0.0
log_num = 0
epoch_toc = time.time()
# End-of-epoch summary over the whole epoch's sample count.
log.info('Time cost={:.2f} s, Thoughput={:.2f} samples/s'.format(
epoch_toc - epoch_tic, total_num/(epoch_toc - epoch_tic)))
# --- Fragment: tail of a forward call (SQuAD2-style, with p_mask and
# is_impossible inputs) followed by backward, a gradient-decay hook, update,
# and separated span/answerability loss bookkeeping. Both the opening of the
# forward call and the end of the final log call are outside this excerpt.
inputs,
token_types,
valid_length,
[start_label, end_label],
p_mask=p_mask, # pylint: disable=line-too-long
is_impossible=is_impossible)
# Average over the number of data-parallel contexts.
ls = out.mean() / len(ctx)
batch_loss_sep.append(out_sep)
batch_loss.append(ls)
if args.accumulate:
ls = ls / args.accumulate
ls.backward()
# update
if not args.accumulate or (batch_id + 1) % args.accumulate == 0:
trainer.allreduce_grads()
nlp.utils.clip_grad_global_norm(params, 1)
# Applies some gradient decay before the step — _apply_gradient_decay
# is defined elsewhere in the file; semantics not visible here.
_apply_gradient_decay()
trainer.update(1, ignore_stale_grad=True)
# Track span loss and answerability-classification loss separately.
step_loss_sep_tmp = np.array(
[[span_ls.mean().asscalar(),
cls_ls.mean().asscalar()] for span_ls, cls_ls in batch_loss_sep])
step_loss_sep_tmp = list(np.sum(step_loss_sep_tmp, axis=0))
step_loss_span += step_loss_sep_tmp[0] / len(ctx)
step_loss_cls += step_loss_sep_tmp[1] / len(ctx)
step_loss += sum([ls.asscalar() for ls in batch_loss])
if (batch_id + 1) % log_interval == 0:
toc = time.time()
# NOTE(review): this log call is truncated by the excerpt boundary.
log.info(
'Epoch: %d, Batch: %d/%d, Loss=%.4f, lr=%.7f '
'Time cost=%.1f Thoughput=%.2f samples/s', epoch_id + 1, batch_id + 1,
# --- Fragment: classification fine-tuning loop body (BERT/RoBERTa) with
# optional fp16 AMP and gradient accumulation; enclosing loop not visible.
label = label.as_in_context(ctx)
if use_roberta:
# RoBERTa takes no segment (token-type) ids.
out = model(input_ids, valid_length)
else:
out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
ls = loss_function(out, label).mean()
if args.dtype == 'float16':
# AMP loss scaling for fp16 training.
with amp.scale_loss(ls, trainer) as scaled_loss:
mx.autograd.backward(scaled_loss)
else:
ls.backward()
# update
if not accumulate or (batch_id + 1) % accumulate == 0:
trainer.allreduce_grads()
nlp.utils.clip_grad_global_norm(params, 1)
# Step size equals the number of accumulated micro-batches.
trainer.update(accumulate if accumulate else 1)
step_num += 1
if accumulate and accumulate > 1:
# set grad to zero for gradient accumulation
all_model_params.zero_grad()
step_loss += ls.asscalar()
if not do_regression:
# Flatten labels for the classification metric.
label = label.reshape((-1))
metric.update([label], [out])
if (batch_id + 1) % (args.log_interval) == 0:
log_train(batch_id, len(train_data), metric, step_loss, args.log_interval,
epoch_id, trainer.learning_rate)
step_loss = 0
if step_num >= num_train_steps:
# Stop once the step budget is reached (the action taken after this
# log line is truncated by the excerpt boundary).
logging.info('Finish training step: %d', step_num)
# --- Fragment: data-parallel training-loop body — batch split across
# contexts, per-shard forward/backward, then one accumulated update. The
# trailing log_train call is truncated by the excerpt boundary.
data_list = list(split_and_load(seqs, ctxs))
for splited_data in data_list:
input_ids, valid_length, segment_ids, label = splited_data
out = model(input_ids,
segment_ids,
valid_length=valid_length)
# Average over contexts so summed shard gradients match one batch.
ls = loss_function(out, label).mean() / len(ctxs)
batch_loss.append(ls)
if args.accumulate:
ls = ls / args.accumulate
ls.backward()
# update
if not args.accumulate or (batch_id +
1) % args.accumulate == 0:
trainer.allreduce_grads()
nlp.utils.clip_grad_global_norm(params, 1)
trainer.update(args.accumulate if args.accumulate else 1,
ignore_stale_grad=True)
step_num += 1
if args.accumulate and args.accumulate > 1:
# set grad to zero for gradient accumulation
all_model_params.zero_grad()
if batch_id == 0 and epoch_id == 0:
# The first iteration includes one-time graph/setup cost, so it is
# timed and logged separately.
toc = time.time()
logging.info(
'Time cost for the first forward-backward =%.2fs',
toc - tic)
batch_loss = sum([ls.asscalar() for ls in batch_loss])
step_loss += batch_loss
if (batch_id + 1) % (args.log_interval) == 0:
# NOTE(review): this call is cut off by the excerpt boundary.
log_train(batch_id, len(train_data), step_loss,
args.log_interval, epoch_id,
# --- Fragment: per-task classification loop body (multi-task variant using
# task.metric and a progress bar), ending at the transition into dev-set
# inference; the dev-loop body is outside this excerpt.
label = label.as_in_context(ctx)
if use_roberta:
# RoBERTa takes no segment (token-type) ids.
out = model(input_ids, valid_length)
else:
out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
ls = loss_function(out, label).mean()
if args.dtype == 'float16':
# AMP loss scaling for fp16 training.
with amp.scale_loss(ls, trainer) as scaled_loss:
mx.autograd.backward(scaled_loss)
else:
ls.backward()
# update
if not accumulate or (batch_id + 1) % accumulate == 0:
trainer.allreduce_grads()
nlp.utils.clip_grad_global_norm(params, 1)
# Step size equals the number of accumulated micro-batches.
trainer.update(accumulate if accumulate else 1)
step_num += 1
if accumulate and accumulate > 1:
# set grad to zero for gradient accumulation
all_model_params.zero_grad()
step_loss += ls.asscalar()
task.metric.update([label], [out])
if (batch_id + 1) % (args.log_interval) == 0:
log_train(batch_id, len(train_data), task.metric, step_loss, args.log_interval,
epoch_id, trainer.learning_rate, tbar)
step_loss = 0
# Block until all asynchronous MXNet ops finish before evaluation.
mx.nd.waitall()
# inference on dev data
# NOTE(review): the body of this loop is outside the excerpt.
for segment, dev_data in dev_data_list:
# --- Fragment: second copy of the SQuAD span-prediction loop body (this one
# uses lazy %-style log args); the final epoch-summary log statement is
# truncated mid-call by the excerpt boundary.
out = net(inputs.astype('float32').as_in_context(ctx),
token_types.astype('float32').as_in_context(ctx),
valid_length.astype('float32').as_in_context(ctx))
ls = loss_function(out, [
start_label.astype('float32').as_in_context(ctx),
end_label.astype('float32').as_in_context(ctx)]).mean()
if accumulate:
# Scale the loss for gradient accumulation.
ls = ls / accumulate
ls.backward()
# update
if not accumulate or (batch_id + 1) % accumulate == 0:
# Optimizer step only every `accumulate` micro-batches.
trainer.allreduce_grads()
nlp.utils.clip_grad_global_norm(params, 1)
trainer.update(1)
step_loss += ls.asscalar()
if (batch_id + 1) % log_interval == 0:
toc = time.time()
log.info('Epoch: %d, Batch: %d/%d, Loss=%.4f, lr=%.7f '
'Time cost=%.1f Thoughput=%.2f samples/s',
epoch_id, batch_id, len(train_dataloader),
step_loss / log_interval,
trainer.learning_rate, toc - tic, log_num / (toc - tic))
tic = time.time()
step_loss = 0.0
log_num = 0
epoch_toc = time.time()
# NOTE(review): truncated — format arguments are outside the excerpt.
log.info('Epoch: {}, Time cost={:.2f} s, Thoughput={:.2f} samples/s'