How to use the gluonnlp.utils.clip_grad_global_norm function in gluonnlp

To help you get started, we’ve selected a few gluonnlp.utils.clip_grad_global_norm examples, based on popular ways it is used in public projects.
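Before looking at the project snippets, here is a minimal, self-contained sketch of the basic call pattern: compute a loss, run backward, rescale all gradients together with clip_grad_global_norm, then take an optimizer step. The tiny Dense model, toy data, and max_norm of 1.0 are illustrative assumptions, not taken from any of the projects below.

import mxnet as mx
import gluonnlp as nlp

net = mx.gluon.nn.Dense(2)
net.initialize()
trainer = mx.gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 1e-3})
loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss()

x = mx.nd.random.uniform(shape=(8, 4))
y = mx.nd.array([0, 1, 0, 1, 0, 1, 0, 1])

# clip_grad_global_norm expects a list of Parameters; parameters that do not
# produce gradients (grad_req='null') are filtered out, as in the scripts below
params = [p for p in net.collect_params().values() if p.grad_req != 'null']

with mx.autograd.record():
    loss = loss_fn(net(x), y).mean()
loss.backward()
# rescale all gradients together so their global L2 norm is at most 1.0
nlp.utils.clip_grad_global_norm(params, 1.0)
trainer.step(1)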

github dmlc / gluon-nlp / scripts / ner / finetune_bert.py (View on Github)
# step size adjustments
            step_num += 1
            if step_num < num_warmup_steps:
                new_lr = config.learning_rate * step_num / num_warmup_steps
            else:
                offset = ((step_num - num_warmup_steps) * config.learning_rate /
                          (num_train_steps - num_warmup_steps))
                new_lr = config.learning_rate - offset
            trainer.set_learning_rate(new_lr)

            with mx.autograd.record():
                loss_value = loss_function(out, tag_ids,
                                           flag_nonnull_tag.expand_dims(axis=2)).mean()

            loss_value.backward()
            nlp.utils.clip_grad_global_norm(params, 1)
            trainer.step(1)

            pred_tags = out.argmax(axis=-1)
            logging.info('loss_value: %6f', loss_value.asscalar())

            num_tag_preds = flag_nonnull_tag.sum().asscalar()
            logging.info(
                'accuracy: %6f', (((pred_tags == tag_ids) * flag_nonnull_tag).sum().asscalar()
                                  / num_tag_preds))
        return step_num
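The NER script above clips the gradients and then calls trainer.step(1), which is fine on a single device. The SQuAD and GLUE scripts further down instead split the step into trainer.allreduce_grads(), the clip, and trainer.update(1), so that clipping runs on the fully aggregated gradients. That split form assumes the Trainer was created with update_on_kvstore=False (done elsewhere in those scripts and not shown in the excerpts), roughly like this hypothetical line:

trainer = mx.gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 3e-5},
                           update_on_kvstore=False)  # lets allreduce_grads()/update() be called manually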

github dmlc / gluon-nlp / scripts / bert / finetune_squad.py (View on Github)
]).sum() / num_labels

                if accumulate:
                    loss = loss / accumulate
                if args.dtype == 'float16':
                    with amp.scale_loss(loss, trainer) as l:
                        mx.autograd.backward(l)
                        norm_clip = 1.0 * size * trainer._amp_loss_scaler.loss_scale
                else:
                    mx.autograd.backward(loss)
                    norm_clip = 1.0 * size

            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, norm_clip)
                trainer.update(1)
                if accumulate:
                    param_dict.zero_grad()

            if args.comm_backend == 'horovod':
                step_loss += hvd.allreduce(loss, average=True).asscalar()
            else:
                step_loss += loss.asscalar()

            if (batch_id + 1) % log_interval == 0:
                toc = time.time()
                log.info('Batch: {}/{}, Loss={:.4f}, lr={:.7f} '
                         'Throughput={:.2f} samples/s'
                         .format(batch_id % len(train_dataloader),
                                 len(train_dataloader), step_loss / log_interval,
                                 trainer.learning_rate, log_num/(toc - tic)))
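A note on the float16 branch above: amp.scale_loss multiplies the loss, and therefore every gradient, by the dynamic loss scale, and the gradients are still scaled when clip_grad_global_norm runs. The script therefore multiplies the clipping threshold by trainer._amp_loss_scaler.loss_scale (and by size, the number of workers, to match how gradients are aggregated). A condensed sketch of that branch, where out, labels, size, params, loss_function, dtype, and trainer are placeholders standing in for the surrounding script:

from mxnet.contrib import amp  # amp.init() and amp.init_trainer(trainer) are assumed to have been called

with mx.autograd.record():
    loss = loss_function(out, labels).mean()
if dtype == 'float16':
    with amp.scale_loss(loss, trainer) as scaled_loss:
        mx.autograd.backward(scaled_loss)
    # gradients still carry the dynamic loss-scale factor at this point,
    # so the clipping threshold is scaled by the same factor
    norm_clip = 1.0 * size * trainer._amp_loss_scaler.loss_scale
else:
    mx.autograd.backward(loss)
    norm_clip = 1.0 * size
trainer.allreduce_grads()
nlp.utils.clip_grad_global_norm(params, norm_clip)
trainer.update(1)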

github eric-haibin-lin / AMLC19-GluonNLP / 05_deployment / bert / finetune_squad.py (View on Github)
out = net(inputs.astype('float32').as_in_context(ctx),
                          token_types.astype('float32').as_in_context(ctx),
                          valid_length.astype('float32').as_in_context(ctx))

                ls = loss_function(out, [
                    start_label.astype('float32').as_in_context(ctx),
                    end_label.astype('float32').as_in_context(ctx)]).mean()

                if accumulate:
                    ls = ls / accumulate
            ls.backward()
            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                trainer.update(1)

            step_loss += ls.asscalar()

            if (batch_id + 1) % log_interval == 0:
                toc = time.time()
                log.info('Epoch: {}, Batch: {}/{}, Loss={:.4f}, lr={:.7f} Time cost={:.1f} Throughput={:.2f} samples/s'  # pylint: disable=line-too-long
                         .format(epoch_id, batch_id, len(train_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, toc - tic, log_num/(toc - tic)))
                tic = time.time()
                step_loss = 0.0
                log_num = 0
        epoch_toc = time.time()
        log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
            epoch_toc - epoch_tic, total_num/(epoch_toc - epoch_tic)))

github dmlc / gluon-nlp / scripts / language_model / run_squad.py (View on Github)
inputs,
                        token_types,
                        valid_length,
                        [start_label, end_label],
                        p_mask=p_mask,  # pylint: disable=line-too-long
                        is_impossible=is_impossible)
                    ls = out.mean() / len(ctx)
                    batch_loss_sep.append(out_sep)
                    batch_loss.append(ls)
                    if args.accumulate:
                        ls = ls / args.accumulate
                    ls.backward()
            # update
            if not args.accumulate or (batch_id + 1) % args.accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                _apply_gradient_decay()
                trainer.update(1, ignore_stale_grad=True)

                step_loss_sep_tmp = np.array(
                    [[span_ls.mean().asscalar(),
                      cls_ls.mean().asscalar()] for span_ls, cls_ls in batch_loss_sep])
                step_loss_sep_tmp = list(np.sum(step_loss_sep_tmp, axis=0))
                step_loss_span += step_loss_sep_tmp[0] / len(ctx)
                step_loss_cls += step_loss_sep_tmp[1] / len(ctx)

            step_loss += sum([ls.asscalar() for ls in batch_loss])
            if (batch_id + 1) % log_interval == 0:
                toc = time.time()
                log.info(
                    'Epoch: %d, Batch: %d/%d, Loss=%.4f, lr=%.7f '
                    'Time cost=%.1f Throughput=%.2f samples/s', epoch_id + 1, batch_id + 1,

github dmlc / gluon-nlp / scripts / bert / finetune_classifier.py (View on Github)
label = label.as_in_context(ctx)
                    if use_roberta:
                        out = model(input_ids, valid_length)
                    else:
                        out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
                    ls = loss_function(out, label).mean()
                    if args.dtype == 'float16':
                        with amp.scale_loss(ls, trainer) as scaled_loss:
                            mx.autograd.backward(scaled_loss)
                    else:
                        ls.backward()

                # update
                if not accumulate or (batch_id + 1) % accumulate == 0:
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(accumulate if accumulate else 1)
                    step_num += 1
                    if accumulate and accumulate > 1:
                        # set grad to zero for gradient accumulation
                        all_model_params.zero_grad()

                step_loss += ls.asscalar()
                if not do_regression:
                    label = label.reshape((-1))
                metric.update([label], [out])
                if (batch_id + 1) % (args.log_interval) == 0:
                    log_train(batch_id, len(train_data), metric, step_loss, args.log_interval,
                              epoch_id, trainer.learning_rate)
                    step_loss = 0
                if step_num >= num_train_steps:
                    logging.info('Finish training step: %d', step_num)
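finetune_classifier.py above and run_glue.py below add gradient accumulation on top of the same pattern: gradients from accumulate mini-batches are summed in place (grad_req set to 'add'), then reduced across devices, clipped, applied with trainer.update, and zeroed for the next window. A condensed sketch of that pattern, assuming the imports from the first sketch; net, train_data, loss_function, and the value of accumulate are placeholders:

accumulate = 4
all_model_params = net.collect_params()
params = [p for p in all_model_params.values() if p.grad_req != 'null']
if accumulate > 1:
    for p in params:
        p.grad_req = 'add'          # sum gradients across mini-batches instead of overwriting
trainer = mx.gluon.Trainer(all_model_params, 'adam', {'learning_rate': 5e-5},
                           update_on_kvstore=False)

for batch_id, (data, label) in enumerate(train_data):
    with mx.autograd.record():
        loss = loss_function(net(data), label).mean() / accumulate
    loss.backward()
    if (batch_id + 1) % accumulate == 0:
        trainer.allreduce_grads()                    # sync gradients across devices
        nlp.utils.clip_grad_global_norm(params, 1)   # clip the accumulated gradients
        trainer.update(accumulate)                   # one optimizer step per accumulation window
        all_model_params.zero_grad()                 # reset accumulated gradients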

github dmlc / gluon-nlp / scripts / language_model / run_glue.py (View on Github)
data_list = list(split_and_load(seqs, ctxs))
                    for splited_data in data_list:
                        input_ids, valid_length, segment_ids, label = splited_data
                        out = model(input_ids,
                                    segment_ids,
                                    valid_length=valid_length)
                        ls = loss_function(out, label).mean() / len(ctxs)
                        batch_loss.append(ls)
                        if args.accumulate:
                            ls = ls / args.accumulate
                        ls.backward()
                # update
                if not args.accumulate or (batch_id +
                                           1) % args.accumulate == 0:
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(args.accumulate if args.accumulate else 1,
                                   ignore_stale_grad=True)
                    step_num += 1
                    if args.accumulate and args.accumulate > 1:
                        # set grad to zero for gradient accumulation
                        all_model_params.zero_grad()
                    if batch_id == 0 and epoch_id == 0:
                        toc = time.time()
                        logging.info(
                            'Time cost for the first forward-backward =%.2fs',
                            toc - tic)
                batch_loss = sum([ls.asscalar() for ls in batch_loss])
                step_loss += batch_loss
                if (batch_id + 1) % (args.log_interval) == 0:
                    log_train(batch_id, len(train_data), step_loss,
                              args.log_interval, epoch_id,

github awslabs / autogluon / autogluon / task / text_classification / pipeline.py (View on Github)
label = label.as_in_context(ctx)
                if use_roberta:
                    out = model(input_ids, valid_length)
                else:
                    out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
                ls = loss_function(out, label).mean()
                if args.dtype == 'float16':
                    with amp.scale_loss(ls, trainer) as scaled_loss:
                        mx.autograd.backward(scaled_loss)
                else:
                    ls.backward()

            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                trainer.update(accumulate if accumulate else 1)
                step_num += 1
                if accumulate and accumulate > 1:
                    # set grad to zero for gradient accumulation
                    all_model_params.zero_grad()

            step_loss += ls.asscalar()
            task.metric.update([label], [out])
            if (batch_id + 1) % (args.log_interval) == 0:
                log_train(batch_id, len(train_data), task.metric, step_loss, args.log_interval,
                          epoch_id, trainer.learning_rate, tbar)
                step_loss = 0
        mx.nd.waitall()

        # inference on dev data
        for segment, dev_data in dev_data_list:

github dmlc / gluon-nlp / scripts / bert / staticbert / static_finetune_squad.py (View on Github)
out = net(inputs.astype('float32').as_in_context(ctx),
                          token_types.astype('float32').as_in_context(ctx),
                          valid_length.astype('float32').as_in_context(ctx))

                ls = loss_function(out, [
                    start_label.astype('float32').as_in_context(ctx),
                    end_label.astype('float32').as_in_context(ctx)]).mean()

                if accumulate:
                    ls = ls / accumulate
            ls.backward()
            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                trainer.update(1)

            step_loss += ls.asscalar()

            if (batch_id + 1) % log_interval == 0:
                toc = time.time()
                log.info('Epoch: %d, Batch: %d/%d, Loss=%.4f, lr=%.7f '
                         'Time cost=%.1f Throughput=%.2f samples/s',
                         epoch_id, batch_id, len(train_dataloader),
                         step_loss / log_interval,
                         trainer.learning_rate, toc - tic, log_num / (toc - tic))
                tic = time.time()
                step_loss = 0.0
                log_num = 0
        epoch_toc = time.time()
        log.info('Epoch: {}, Time cost={:.2f} s, Throughput={:.2f} samples/s'
                 .format(epoch_id, epoch_toc - epoch_tic,
                         total_num / (epoch_toc - epoch_tic)))