How to use the gluonnlp.utils.mkdir function in gluonnlp

To help you get started, we've selected a few gluonnlp.utils.mkdir examples, based on popular ways the function is used in public projects.
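
The helper is a small convenience wrapper for preparing output and checkpoint directories: it expands a leading ~ in the path and, in the gluonnlp versions used by these examples, does not raise if the directory already exists. Below is a minimal sketch of the typical call pattern; the output path is hypothetical and only gluonnlp itself is assumed to be installed.

import os
import gluonnlp as nlp

# Hypothetical output location used only for illustration.
output_dir = '~/gluonnlp_output'

# Expands the user path and creates the directory; calling it again on an
# existing directory is expected to be a no-op rather than an error.
nlp.utils.mkdir(output_dir)

assert os.path.isdir(os.path.expanduser(output_dir))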

github dmlc/gluon-nlp/tests/unittest/test_utils.py (view on GitHub)
def test_mkdir(dirname):
    nlp.utils.mkdir(dirname)
    assert os.path.isdir(os.path.expanduser(dirname))

github eric-haibin-lin/AMLC19-GluonNLP/04_contextual_representation/bert/export/export.py (view on GitHub)
type=int,
                    default=384,
                    help='The maximum total input sequence length after WordPiece tokenization. '
                         'Sequences longer than this need to be truncated, and sequences shorter '
                         'than this need to be padded. Default is 384')

parser.add_argument('--dropout',
                    type=float,
                    default=0.1,
                    help='The dropout probability for the classification/regression head.')

args = parser.parse_args()

# create output dir
output_dir = args.output_dir
nlp.utils.mkdir(output_dir)

###############################################################################
#                                Logging                                      #
###############################################################################

log = logging.getLogger('gluonnlp')
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
                              datefmt='%H:%M:%S')
fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
log.addHandler(console)

github dmlc/gluon-nlp/scripts/bert/export.py (view on GitHub)
type=int,
                    default=384,
                    help='The maximum total input sequence length after WordPiece tokenization. '
                         'Sequences longer than this need to be truncated, and sequences shorter '
                         'than this need to be padded. Default is 384')

parser.add_argument('--dropout',
                    type=float,
                    default=0.1,
                    help='The dropout probability for the classification/regression head.')

args = parser.parse_args()

# create output dir
output_dir = args.output_dir
nlp.utils.mkdir(output_dir)

###############################################################################
#                                Logging                                      #
###############################################################################

log = logging.getLogger('gluonnlp')
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
                              datefmt='%H:%M:%S')
fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
log.addHandler(console)

github dmlc/gluon-nlp/scripts/language_model/run_glue.py (view on GitHub)
# initialize classifier
if not args.model_parameters:
    model.classifier.initialize(init=initializer, ctx=ctxs)
    model.pooler.initialize(init=initializer, ctx=ctxs)

# load checkpointing
output_dir = args.output_dir

if args.model_parameters:
    logging.info('loading model params from %s', args.model_parameters)
    nlp.utils.load_parameters(model,
                              args.model_parameters,
                              ctx=ctxs,
                              cast_dtype=True)

nlp.utils.mkdir(output_dir)

logging.debug(model)
model.hybridize(static_alloc=True)
loss_function.hybridize(static_alloc=True)

logging.info('processing dataset...')
train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data(
    tokenizer, task, args.batch_size, args.dev_batch_size, args.max_len, vocab)


def test(loader_test, segment):
    """Inference function on the test dataset."""
    logging.info('Now we are doing testing on %s with %s.', segment, ctxs)

    tic = time.time()
    results = []

github eric-haibin-lin/AMLC19-GluonNLP/05_deployment/bert/run_pretraining.py (view on GitHub)
save_parameters(step_num, model, args.ckpt_dir)
    mx.nd.waitall()
    train_end_time = time.time()
    logging.info('Train cost={:.1f}s'.format(train_end_time - train_begin_time))

if __name__ == '__main__':
    ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
          [mx.gpu(int(x)) for x in args.gpus.split(',')]

    model, nsp_loss, mlm_loss, vocab = get_model_loss(ctx, args.model, args.pretrained,
                                                      args.dataset_name, None, args.dtype,
                                                      ckpt_dir=args.ckpt_dir,
                                                      start_step=args.start_step)

    store = mx.kv.create(args.kvstore)
    nlp.utils.mkdir(args.ckpt_dir)

    if args.data:
        logging.info('Using training data at {}'.format(args.data))
        num_parts = 1 if args.dummy_data_len else store.num_workers
        part_idx = 0 if args.dummy_data_len else store.rank
        data_train = get_pretrain_data_npz(args.data, args.batch_size, len(ctx), True,
                                           args.use_avg_len, args.num_buckets,
                                           num_parts=num_parts, part_idx=part_idx,
                                           prefetch=not args.dummy_data_len)
        train(data_train, model, nsp_loss, mlm_loss, len(vocab), ctx, store)
    if args.data_eval:
        logging.info('Using evaluation data at {}'.format(args.data_eval))
        data_eval = get_pretrain_data_npz(args.data_eval, args.batch_size_eval, len(ctx),
                                          False, False, 1)
        evaluate(data_eval, model, nsp_loss, mlm_loss, len(vocab), ctx,
                 args.log_interval, args.dtype)

github dmlc/gluon-nlp/scripts/bert/run_pretraining.py (view on GitHub)
dataset_name, vocab, args.dtype,
                                  ckpt_dir=args.ckpt_dir,
                                  start_step=args.start_step)
    logging.info('Model created')
    data_eval = args.data_eval

    if args.raw:
        if args.sentencepiece:
            tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
                                                 lower=not args.cased)
        else:
            tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)

        cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
        cache_file = os.path.join(cache_dir, 'part-000.npz')
        nlp.utils.mkdir(cache_dir)

        # generate dev dataset from the raw text if needed
        if not args.eval_use_npz:
            data_eval = cache_file
            if not os.path.isfile(cache_file) and rank == 0:
                generate_dev_set(tokenizer, vocab, cache_file, args)

    logging.debug('Random seed set to %d', random_seed)
    mx.random.seed(random_seed)

    if args.data:
        if args.raw:
            get_dataset_fn = functools.partial(get_pretrain_data_text,
                                               max_seq_length=args.max_seq_length,
                                               short_seq_prob=args.short_seq_prob,
                                               masked_lm_prob=args.masked_lm_prob,

github dmlc/gluon-nlp/scripts/bert/run_pretraining_hvd.py (view on GitHub)
ckpt_dir=args.ckpt_dir,
                                                      start_step=args.start_step)
    logging.debug('Model created')
    data_eval = args.data_eval

    if args.raw:
        if args.sentencepiece:
            tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
                                                 num_best=args.sp_nbest,
                                                 alpha=args.sp_alpha, lower=not args.cased)
        else:
            tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)

        cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
        cache_file = os.path.join(cache_dir, 'part-000.npz')
        nlp.utils.mkdir(cache_dir)

        # generate dev dataset from the raw text if needed
        if not args.eval_use_npz:
            data_eval = cache_file
            if not os.path.isfile(cache_file) and rank == 0:
                generate_dev_set(tokenizer, vocab, cache_file, args)

    logging.debug('Random seed set to %d', random_seed)
    mx.random.seed(random_seed)

    if args.data:
        if args.raw:
            get_dataset_fn = functools.partial(get_pretrain_data_text,
                                               max_seq_length=args.max_seq_length,
                                               short_seq_prob=args.short_seq_prob,
                                               masked_lm_prob=args.masked_lm_prob,

github dmlc/gluon-nlp/scripts/bert/finetune_classifier.py (view on GitHub)
else:
    model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes)
# initialize classifier
if not model_parameters:
    model.classifier.initialize(init=initializer, ctx=ctx)

# load checkpointing
output_dir = args.output_dir
if pretrained_bert_parameters:
    logging.info('loading bert params from %s', pretrained_bert_parameters)
    nlp.utils.load_parameters(model.bert, pretrained_bert_parameters, ctx=ctx, ignore_extra=True,
                              cast_dtype=True)
if model_parameters:
    logging.info('loading model params from %s', model_parameters)
    nlp.utils.load_parameters(model, model_parameters, ctx=ctx, cast_dtype=True)
nlp.utils.mkdir(output_dir)

logging.debug(model)
model.hybridize(static_alloc=True)
loss_function.hybridize(static_alloc=True)

if deploy:
    logging.info('load symbol file directly as SymbolBlock for model deployment')
    model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
                                         ['data0', 'data1', 'data2'],
                                         '{}-0000.params'.format(args.model_prefix))
    model.hybridize(static_alloc=True, static_shape=True)

# data processing
do_lower_case = 'uncased' in dataset
if use_roberta:
    bert_tokenizer = nlp.data.GPT2BPETokenizer()

github eric-haibin-lin/AMLC19-GluonNLP/04_contextual_representation/bert/create_pretraining_data.py (view on GitHub)
def main():
    """Main function."""
    time_start = time.time()

    # random seed
    random.seed(args.random_seed)

    # create output dir
    output_dir = os.path.expanduser(args.output_dir)
    nlp.utils.mkdir(output_dir)

    # vocabulary and tokenizer
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
        if args.dataset_name:
            warnings.warn('Both --dataset_name and --sentencepiece are provided. '
                          'The vocabulary will be loaded based on --sentencepiece.')
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
        tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, num_best=args.sp_nbest,
                                             alpha=args.sp_alpha, lower=not args.cased)
    else:
        logging.info('loading vocab file from pre-defined dataset: %s', args.dataset_name)
        vocab = nlp.data.utils._load_pretrained_vocab(args.dataset_name, root=output_dir,
                                                      cls=nlp.vocab.BERTVocab)
        tokenizer = BERTTokenizer(vocab=vocab, lower='uncased' in args.dataset_name)

github dmlc/gluon-nlp/scripts/bert/data/create_pretraining_data.py (view on GitHub)
def main():
    """Main function."""
    time_start = time.time()

    # random seed
    random.seed(args.random_seed)

    # create output dir
    output_dir = os.path.expanduser(args.output_dir)
    nlp.utils.mkdir(output_dir)

    # vocabulary and tokenizer
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
        if args.dataset_name:
            warnings.warn('Both --dataset_name and --sentencepiece are provided. '
                          'The vocabulary will be loaded based on --sentencepiece.')
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
        tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, num_best=args.sp_nbest,
                                             alpha=args.sp_alpha, lower=not args.cased)
    else:
        logging.info('loading vocab file from pre-defined dataset: %s', args.dataset_name)
        vocab = nlp.data.utils._load_pretrained_vocab(args.dataset_name, root=output_dir,
                                                      cls=nlp.vocab.BERTVocab)
        tokenizer = BERTTokenizer(vocab=vocab, lower='uncased' in args.dataset_name)