def test_gbw():
batch_size = 80
seq_len = 35
stream = nlp.data.GBWStream(segment='test')
freq = nlp.data.utils.Counter(
itertools.chain.from_iterable(itertools.chain.from_iterable(stream)))
assert len(freq) == 21545
assert sum(c for c in freq.values()) == 159658
assert freq['English'] == 14
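# A minimal sketch of the counting pattern above, using only the standard
# library: the stream yields shards of sentences of tokens, so two levels of
# chain.from_iterable flatten it into a single token iterator for the Counter.
import itertools
from collections import Counter

toy_stream = [[['the', 'cat'], ['the', 'dog']], [['a', 'cat']]]  # shards -> sentences -> tokens
toy_freq = Counter(itertools.chain.from_iterable(itertools.chain.from_iterable(toy_stream)))
assert toy_freq['the'] == 2 and toy_freq['cat'] == 2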
if args.dataset_name:
warnings.warn('Both --dataset_name and --sentencepiece are provided. '
'The vocabulary will be loaded based on --sentencepiece')
dataset_name = None
vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
model, nsp_loss, mlm_loss, vocab = get_model_loss([ctx], args.model, args.pretrained,
dataset_name, vocab, args.dtype,
ckpt_dir=args.ckpt_dir,
start_step=args.start_step)
logging.debug('Model created')
data_eval = args.data_eval
if args.raw:
if args.sentencepiece:
tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
num_best=args.sp_nbest,
alpha=args.sp_alpha, lower=not args.cased)
else:
tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)
cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
cache_file = os.path.join(cache_dir, 'part-000.npz')
nlp.utils.mkdir(cache_dir)
# generate dev dataset from the raw text if needed
if not args.eval_use_npz:
data_eval = cache_file
if not os.path.isfile(cache_file) and rank == 0:
generate_dev_set(tokenizer, vocab, cache_file, args)
logging.debug('Random seed set to %d', random_seed)
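# A minimal sketch of the cache-then-reuse pattern above, with a hypothetical
# build_arrays() helper standing in for generate_dev_set(); only rank 0 writes
# the npz file, and every worker reuses the same cached path afterwards.
import os
import numpy as np

def ensure_dev_cache(cache_dir, rank, build_arrays):
    cache_file = os.path.join(cache_dir, 'part-000.npz')
    os.makedirs(cache_dir, exist_ok=True)
    if not os.path.isfile(cache_file) and rank == 0:
        np.savez(cache_file, **build_arrays())
    return cache_file

path = ensure_dev_cache('/tmp/data_eval_cache', rank=0,
                        build_arrays=lambda: {'input_ids': np.arange(4)})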
def add_parameters(parser):
"""Add evaluation specific parameters to parser."""
group = parser.add_argument_group('Evaluation arguments')
group.add_argument('--eval-batch-size', type=int, default=512)
# Datasets
group.add_argument(
'--similarity-datasets', type=str,
default=nlp.data.word_embedding_evaluation.word_similarity_datasets,
nargs='*',
help='Word similarity datasets to use for intrinsic evaluation.')
group.add_argument(
'--similarity-functions', type=str,
default=nlp.embedding.evaluation.list_evaluation_functions(
'similarity'), nargs='+',
help='Word similarity functions to use for intrinsic evaluation.')
group.add_argument(
'--analogy-datasets', type=str, default=['GoogleAnalogyTestSet'],
nargs='*',
help='Word analogy datasets to use for intrinsic evaluation.')
group.add_argument(
'--analogy-functions', type=str,
default=nlp.embedding.evaluation.list_evaluation_functions('analogy'),
nargs='+',
help='Word analogy functions to use for intrinsic evaluation.')
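# A usage sketch, assuming the add_parameters() defined above is importable and
# gluonnlp is installed as nlp (its defaults reference nlp.data / nlp.embedding).
import argparse

parser = argparse.ArgumentParser(description='Intrinsic embedding evaluation')
add_parameters(parser)
args = parser.parse_args(['--eval-batch-size', '256'])
print(args.eval_batch_size, args.similarity_functions)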
def __call__(self, example):
"""Map an example into a list of distinct tokens with their counts.

Parameters
----------
example : dict
Example to process, with 'context_tokens' and 'ques_tokens' keys.

Returns
-------
mapped_values : List[Tuple]
Result of the mapping process. Each tuple is in (token, count) format.
"""
para_counter = data.count_tokens(example['context_tokens'] if not self._iterate_over_example
else [c for tkn in example['context_tokens'] for c in tkn])
ques_counter = data.count_tokens(example['ques_tokens'] if not self._iterate_over_example
else [c for tkn in example['ques_tokens'] for c in tkn])
counter = para_counter + ques_counter
return list(counter.items())
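# A toy illustration of the mapping above, using collections.Counter in place
# of data.count_tokens (both return a token -> count mapping).
from collections import Counter

toy_example = {'context_tokens': ['the', 'cat', 'sat'], 'ques_tokens': ['the', 'cat']}
toy_counter = Counter(toy_example['context_tokens']) + Counter(toy_example['ques_tokens'])
print(list(toy_counter.items()))  # e.g. [('the', 2), ('cat', 2), ('sat', 1)]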
def register_vocab(dataset, sha1):
if dataset not in nlp.data.utils._vocab_sha1:
nlp.data.utils._vocab_sha1[dataset] = sha1
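# A usage sketch with a made-up dataset name and checksum: register_vocab()
# above only records the SHA-1 so the vocabulary download utilities can verify it.
register_vocab('my_custom_corpus', '0000000000000000000000000000000000000000')
assert 'my_custom_corpus' in nlp.data.utils._vocab_sha1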
if isinstance(dataset, nlp.data.NumpyDataset):
lengths = dataset.get_field('valid_lengths')
elif isinstance(dataset, BERTPretrainDataset):
lengths = dataset.transform(lambda input_ids, segment_ids, masked_lm_positions, \
masked_lm_ids, masked_lm_weights, \
next_sentence_labels, valid_lengths: \
valid_lengths, lazy=False)
else:
raise ValueError('unexpected dataset type: %s' % str(dataset))
# A batch includes: input_id, masked_id, masked_position, masked_weight,
# next_sentence_label, segment_id, valid_length
batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())
if self._use_avg_len:
# sharded data loader
sampler = nlp.data.FixedBucketSampler(lengths=lengths,
# batch_size per shard
batch_size=self._batch_size,
num_buckets=self._num_buckets,
shuffle=self._shuffle,
use_average_length=True,
num_shards=self._num_ctxes)
dataloader = nlp.data.ShardedDataLoader(dataset,
batch_sampler=sampler,
batchify_fn=batchify_fn,
num_workers=self._num_ctxes)
else:
sampler = nlp.data.FixedBucketSampler(lengths,
batch_size=self._batch_size * self._num_ctxes,
num_buckets=self._num_buckets,
ratio=0,
shuffle=self._shuffle)
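# A minimal sketch of FixedBucketSampler on toy lengths, assuming gluonnlp is
# importable as nlp: each yielded batch is a list of sample indices whose
# sequence lengths fall into the same bucket, which keeps padding small.
import random

toy_lengths = [random.randint(5, 60) for _ in range(100)]
toy_sampler = nlp.data.FixedBucketSampler(lengths=toy_lengths, batch_size=8,
                                          num_buckets=4, shuffle=False)
for batch_ids in toy_sampler:
    print(len(batch_ids), sorted(toy_lengths[i] for i in batch_ids)[:4])
    break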
group.add_argument('--embedding-name', type=str, default='fasttext',
help=('Name of embedding type to load. '
'Valid entries: {}'.format(
', '.join(nlp.embedding.list_sources().keys()))))
group.add_argument('--embedding-source', type=str, default='wiki.simple',
help=('Source from which to initialize the embedding.'
'Pass --list-embedding-sources to get a list of '
'valid sources for a given --embedding-name.'))
group.add_argument('--list-embedding-sources', action='store_true')
# Evaluation arguments
group = parser.add_argument_group('Evaluation arguments')
group.add_argument('--ignore-oov', action='store_true',
help='Drop OOV words from evaluation datasets.')
## Datasets
group.add_argument(
'--similarity-datasets', type=str,
default=nlp.data.word_embedding_evaluation.word_similarity_datasets,
nargs='*',
help='Word similarity datasets to use for intrinsic evaluation.')
group.add_argument(
'--similarity-functions', type=str,
default=nlp.embedding.evaluation.list_evaluation_functions(
'similarity'), nargs='+',
help='Word similarity functions to use for intrinsic evaluation.')
group.add_argument(
'--analogy-datasets', type=str,
default=nlp.data.word_embedding_evaluation.word_analogy_datasets,
nargs='*',
help='Word analogy datasets to use for intrinsic evaluation.')
group.add_argument(
'--analogy-functions', type=str,
default=nlp.embedding.evaluation.list_evaluation_functions('analogy'),
nargs='+',
help='Word analogy functions to use for intrinsic evaluation.')
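# A small sketch of the --list-embedding-sources path, assuming gluonnlp is
# importable as nlp: list_sources() without arguments returns a dict keyed by
# embedding name, while list_sources('fasttext') lists the valid source names.
print(sorted(nlp.embedding.list_sources().keys())[:3])
print(nlp.embedding.list_sources('fasttext')[:3])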
from transformer.model import get_model
with open(args.vocab_file, 'r') as f:
vocab = nlp.Vocab.from_json(f.read())
ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
model, vocab = get_model('transformerxl', vocab=vocab, dataset_name=args.dataset,
clamp_len=args.clamp_len)
model.initialize(ctx=ctx)
model.load_parameters(args.parameter_file, ignore_extra=False)
model.hybridize()
print(model)
# Data
if args.dataset == 'wt103':
val_dataset, test_dataset = [
nlp.data.WikiText103(segment=segment, skip_empty=False, bos=vocab.bos_token,
eos=vocab.eos_token) for segment in ['val', 'test']
]
elif args.dataset == 'lm1b':
# bos=vocab.eos_token is not a typo: tf uses ['<s>'] + symbols + ['<s>']
test_datasets = list(
nlp.data.GBWStream(segment='test', skip_empty=True, bos=vocab.eos_token,
eos=vocab.eos_token))
assert len(test_datasets) == 1
test_dataset = mx.gluon.data.SimpleDataset(
list(itertools.chain.from_iterable(test_datasets[0])))
val_dataset = None
elif args.dataset == 'text8':
dataset = nlp.data.Text8(max_sentence_length=None)
chars = list(itertools.chain.from_iterable(list(w) + ['_'] for w in dataset[0]))
num_test_chars = 5000000
val_dataset = mx.gluon.data.SimpleDataset(chars[-2 * num_test_chars:-num_test_chars])
test_dataset = mx.gluon.data.SimpleDataset(chars[-num_test_chars:])
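# A toy sketch of the text8-style split above: the last 2 * num_test_chars
# characters are held out, the earlier chunk as validation and the final chunk as test.
toy_chars = list('abcdefghij')
n_test = 2
toy_val = toy_chars[-2 * n_test:-n_test]   # ['g', 'h']
toy_test = toy_chars[-n_test:]             # ['i', 'j']
assert toy_val == ['g', 'h'] and toy_test == ['i', 'j']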
def get_tokenizer(lm_model):
if lm_model.startswith('gpt2'):
return nlp.data.GPT2BPETokenizer(), nlp.data.GPT2BPEDetokenizer()
else:
return nlp.data.SacreMosesTokenizer(), nlp.data.SacreMosesDetokenizer(return_str=True)
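# A usage sketch, assuming gluonnlp is importable as nlp and the BPE vocabulary
# files can be downloaded: both tokenizers are callable on a raw string and
# return a list of tokens, and the matching detokenizer reverses that list.
tokenizer, detokenizer = get_tokenizer('gpt2_117m')
tokens = tokenizer('Hello, world!')
print(tokens)
print(detokenizer(tokens))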