import mxnet as mx
import gluonnlp as nlp


def test_bertvocab():
    ctx = mx.cpu()
    bert_base1, vocab1 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='book_corpus_wiki_en_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base2, vocab2 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='book_corpus_wiki_en_uncased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base3, vocab3 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_multilingual_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base4, vocab4 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_multilingual_uncased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base5, vocab5 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_cn_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base6, vocab6 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='kobert_news_wiki_ko_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
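    # --- Hedged usage sketch (not part of the original test) ---
    # Each call above returns a (model, vocab) pair; the vocab is a BERTVocab that can
    # back a BERTTokenizer. The sample sentence below is purely illustrative.
    tokenizer1 = nlp.data.BERTTokenizer(vocab1, lower=False)   # cased vocabulary
    tokens = tokenizer1('GluonNLP loads BERT in a single call')
    token_ids = vocab1[tokens]
    print(vocab1.cls_token, vocab1.sep_token, vocab1.padding_token, len(vocab1))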
def test_get_elmo_models():
    model_names = ['elmo_2x1024_128_2048cnn_1xhighway', 'elmo_2x2048_256_2048cnn_1xhighway',
                   'elmo_2x4096_512_2048cnn_2xhighway', 'elmo_2x4096_512_2048cnn_2xhighway']
    datasets = ['gbw', 'gbw', 'gbw', '5bw']
    for model_name, dataset in zip(model_names, datasets):
        print('testing forward for %s on dataset %s' % (model_name, dataset))
        model, _ = nlp.model.get_model(model_name,
                                       dataset_name=dataset,
                                       pretrained=dataset is not None,
                                       root='tests/data/model/')
        print(model)
        if not dataset:
            model.collect_params().initialize()
        begin_state = model.begin_state(mx.nd.zeros, batch_size=20)
        output, state = model(mx.nd.arange(35000).reshape(20, 35, 50), begin_state)
        del model
        mx.nd.waitall()
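# --- Hedged sketch: feeding real tokens to the ELMo models tested above ---
# ELMo expects character ids of shape (batch_size, seq_len, 50); ELMoCharVocab performs
# that mapping. Names follow the GluonNLP API; the sentence is illustrative only.
elmo_char_vocab = nlp.vocab.ELMoCharVocab()
sentence = ['Deep', 'contextual', 'representations', '.']
char_ids = mx.nd.array([elmo_char_vocab[sentence]])        # shape (1, 4, 50)
elmo, _ = nlp.model.get_model('elmo_2x1024_128_2048cnn_1xhighway',
                              dataset_name='gbw', pretrained=True)
states = elmo.begin_state(mx.nd.zeros, batch_size=1)
outputs, states = elmo(char_ids, states)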
    if not has_missing_params:
        model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                           pretrained=True)
    else:
        with pytest.raises(AssertionError):
            model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                               pretrained=True)
        if not disable_missing_parameters:
            model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                               pretrained=True,
                                               pretrained_allow_missing=True)
        elif 'biobert' in dataset:
            # BioBERT-specific test case
            model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                               pretrained=True,
                                               pretrained_allow_missing=True,
                                               use_decoder=False,
                                               use_classifier=False)
        elif 'clinicalbert' in dataset:
            # ClinicalBERT-specific test case
            model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                               pretrained=True,
                                               pretrained_allow_missing=True,
                                               use_decoder=False)
        else:
            assert False, "Testcase needs to be adapted."
    assert len(vocab) == vocab_size[dataset]
    for token in special_tokens:
        assert token in vocab, "Token %s not found in the vocab" % token
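# --- Hedged illustration of `pretrained_allow_missing` ---
# Some published checkpoints (e.g. the BioBERT conversions) ship without decoder and
# classifier weights, so loading them is expected to succeed only when missing parameters
# are allowed. The dataset name below follows the GluonNLP model zoo; adjust if it differs.
biobert, biobert_vocab = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='biobert_v1.1_pubmed_cased',
                                             pretrained=True,
                                             pretrained_allow_missing=True,
                                             use_decoder=False,
                                             use_classifier=False)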
        'reward_attr': 'accuracy',
        'dist_ip_addrs': dist_ip_addrs,
        'searcher': search_strategy,
        'search_options': search_options,
    }
    if search_strategy == 'hyperband':
        scheduler_options.update({
            'searcher': 'random',
            'max_t': epochs,
            'grace_period': grace_period if grace_period else epochs // 4})
    results = BaseTask.run_fit(train_text_classification, search_strategy,
                               scheduler_options)
    args = sample_config(train_text_classification.args, results['best_config'])
    get_model_params = results.pop('get_model_args')
    get_model_params['ctx'] = mx.cpu(0)
    bert, _ = nlp.model.get_model(**get_model_params)
    model = get_network(bert, results.pop('class_labels'), 'roberta' in args.net)
    update_params(model, results.pop('model_params'))
    transform = results.pop('transform')
    test_transform = results.pop('test_transform')
    return TextClassificationPredictor(model, transform, test_transform, results, checkpoint, args)
        raise ValueError('The max_seq_length (%d) must be greater than max_query_length '
                         '(%d) + 3' % (max_seq_length, max_query_length))

    # vocabulary and tokenizer
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
        if dataset_name:
            warnings.warn('Both --dataset_name and --sentencepiece are provided. '
                          'The vocabulary will be loaded based on --sentencepiece.')
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
        dataset_name = None
    else:
        vocab = None

    pretrained = not model_parameters and not pretrained_bert_parameters and not args.sentencepiece
    bert, vocab = nlp.model.get_model(
        name=model_name,
        dataset_name=dataset_name,
        vocab=vocab,
        pretrained=pretrained,
        ctx=ctx,
        use_pooler=False,
        use_decoder=False,
        use_classifier=False)

    if args.sentencepiece:
        tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, lower=lower)
    else:
        tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=lower)

    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Stack(),
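    # --- Hedged sketch: how such a Tuple batchify is typically composed ---
    # The call above is truncated in this excerpt; the pattern below is an assumption,
    # not the original code. It pads variable-length token/segment ids and stacks scalars.
    example_batchify = nlp.data.batchify.Tuple(
        nlp.data.batchify.Stack(),                                          # example ids
        nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),  # token ids
        nlp.data.batchify.Pad(axis=0, pad_val=0),                           # segment ids
        nlp.data.batchify.Stack('float32'))                                 # valid length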
                                           vocab=vocabulary,
                                           pretrained=not args.gluon_parameter_file,
                                           use_pooler=False,
                                           use_decoder=False,
                                           use_classifier=False)
    try:
        bert.cast('float16')
        bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
        bert.cast('float32')
    except AssertionError:
        bert.cast('float32')
        bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
else:
    assert not args.gluon_vocab_file, \
        'Cannot specify --gluon_vocab_file without specifying --gluon_parameter_file'
    bert, vocabulary = nlp.model.get_model(args.gluon_model,
                                           dataset_name=args.gluon_dataset,
                                           pretrained=not args.gluon_parameter_file,
                                           use_pooler=False,
                                           use_decoder=False,
                                           use_classifier=False)
print(bert)
tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case)
dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter(' ||| '))
trans = nlp.data.BERTSentenceTransform(tokenizer, max_length)
dataset = dataset.transform(trans)
bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1,
                                           shuffle=True, last_batch='rollover')
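# --- Hedged sketch: consuming the DataLoader built above ---
# BERTSentenceTransform yields (token_ids, valid_length, segment_ids); with use_pooler,
# use_decoder and use_classifier all False, the model returns only the sequence encoding.
for token_ids, valid_length, segment_ids in bert_dataloader:
    seq_encoding = bert(token_ids, segment_ids, valid_length.astype('float32'))
    print(seq_encoding.shape)   # (1, max_length, 768) for bert_12_768_12
    break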
    dtype : float
        Data type of the model for training.
    ckpt_dir : str
        The path to the checkpoint directory.
    start_step : int or None
        If provided, it loads the model from the corresponding checkpoint from the ckpt_dir.

    Returns
    -------
    BERTModel : the model for pre-training.
    Loss : the next sentence prediction loss.
    Loss : the masked language model loss.
    BERTVocab : the vocabulary.
    """
    # model
    model, vocabulary = nlp.model.get_model(model, dataset_name=dataset_name, vocab=vocab,
                                            pretrained=pretrained, ctx=ctx)
    if not pretrained:
        model.initialize(init=mx.init.Normal(0.02), ctx=ctx)
    model.cast(dtype)

    if ckpt_dir and start_step:
        param_path = os.path.join(ckpt_dir, '%07d.params' % start_step)
        nlp.utils.load_parameters(model, param_path, ctx=ctx)
        logging.info('Loading step %d checkpoints from %s.', start_step, param_path)

    model.hybridize(static_alloc=True)

    # losses
    nsp_loss = mx.gluon.loss.SoftmaxCELoss()
    mlm_loss = mx.gluon.loss.SoftmaxCELoss()
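    # --- Hedged sketch (illustrative shapes only): applying the two losses created above ---
    # `classified` stands in for the next-sentence head's logits and `decoded` for the
    # masked language-model head's logits; neither name comes from the original script.
    batch_size, num_masked, vocab_size = 2, 3, 30522
    classified = mx.nd.random.uniform(shape=(batch_size, 2))
    ns_label = mx.nd.array([0, 1])
    decoded = mx.nd.random.uniform(shape=(batch_size * num_masked, vocab_size))
    masked_id = mx.nd.zeros((batch_size * num_masked,))
    print(nsp_loss(classified, ns_label).mean().asscalar())
    print(mlm_loss(decoded, masked_id).mean().asscalar())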
    pretrained : bool, default False
        Whether to load the pre-trained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pre-trained weights.
    root : str, default '$MXNET_HOME/models', where MXNET_HOME defaults to '~/.mxnet'
        Location for keeping the model parameters.

    Returns
    -------
    gluon.Block, gluonnlp.Vocab, (optional) gluonnlp.Vocab
    """
    models = {'gpt2_117m': gpt2_117m,
              'gpt2_345m': gpt2_345m}
    name = name.lower()
    if name not in models:
        return _get_model(name, **kwargs)
    return models[name](**kwargs)
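# --- Hedged usage sketch for the dispatcher above ---
# 'gpt2_117m' with dataset_name='openai_webtext' follows the GluonNLP model zoo naming;
# in newer releases the same call also works through nlp.model.get_model directly.
gpt2, gpt2_vocab = get_model('gpt2_117m', dataset_name='openai_webtext',
                             pretrained=True, ctx=mx.cpu())
print(gpt2)
print(len(gpt2_vocab))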