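The snippets below are excerpted from GluonNLP's test suite and rely on the test modules' usual imports. A plausible header is sketched here; the `eprint` helper is an assumption (a small utility that logs to stderr), not part of any library API.

import itertools
import sys

import numpy as np
import pytest
import mxnet as mx
from mxnet import nd
from numpy.testing import assert_allclose, assert_almost_equal

import gluonnlp as nlp
from gluonnlp.data import batchify


def eprint(*args, **kwargs):
    # assumed test helper: log progress to stderr so it shows up during test runs
    print(*args, file=sys.stderr, **kwargs)
    sys.stderr.flush()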
def test_gbw():
    batch_size = 80
    seq_len = 35

    stream = nlp.data.GBWStream(segment='test')
    # The stream yields corpus shards (datasets of sentences); flattening it twice
    # (shards -> sentences -> tokens) gives the individual tokens counted below.
    freq = nlp.data.utils.Counter(
        itertools.chain.from_iterable(itertools.chain.from_iterable(stream)))
    assert len(freq) == 21545
    assert sum(c for c in freq.values()) == 159658
    assert freq['English'] == 14
# Vocab construction checks; `counter` and `v9` are defined earlier in the
# original vocabulary test.
assert v9.unknown_token == '<unk>'
assert v9.reserved_tokens == ['b', 'a']
assert v9.embedding is None
assert 'a' in v9

v10 = nlp.Vocab(counter, max_size=None, min_freq=100, unknown_token='<unk>',
                padding_token=None, bos_token=None, eos_token=None, reserved_tokens=['b', 'c'])
assert len(v10) == 3
assert v10.token_to_idx == {'<unk>': 0, 'b': 1, 'c': 2}
assert v10.idx_to_token[1] == 'b'
assert v10.unknown_token == '<unk>'
assert v10.reserved_tokens == ['b', 'c']
assert v10.embedding is None
assert 'a' not in v10

v11 = nlp.Vocab(counter, max_size=1, min_freq=2, unknown_token='<unk>',
                padding_token=None, bos_token=None, eos_token=None,
                reserved_tokens=['<pad>', 'b'])
assert len(v11) == 4
assert v11.token_to_idx == {'<unk>': 0, '<pad>': 1, 'b': 2, 'c': 3}
assert v11.idx_to_token[1] == '<pad>'
assert v11.unknown_token == '<unk>'
assert v11.reserved_tokens == ['<pad>', 'b']
assert v11.embedding is None
assert 'a' not in v11

v12 = nlp.Vocab(counter, max_size=None, min_freq=2, unknown_token='b',
                padding_token=None, bos_token=None, eos_token=None, reserved_tokens=['<unk>'])
assert len(v12) == 3
assert v12.token_to_idx == {'b': 0, '<unk>': 1, 'c': 2}
assert v12.idx_to_token[1] == '<unk>'
assert v12.unknown_token == 'b'
# `v3`, `my_embed3` and `my_embed4` are defined earlier in the original test;
# each 10-dimensional row below is the concatenation of two attached
# 5-dimensional embeddings.
assert v3.embedding.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
assert v3.embedding.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
assert_almost_equal(v3.embedding.idx_to_vec.asnumpy(),
                    np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                               0.11, 0.12, 0.13, 0.14, 0.15],
                              [1.1, 1.2, 1.3, 1.4, 1.5,
                               0.06, 0.07, 0.08, 0.09, 0.1],
                              [0.6, 0.7, 0.8, 0.9, 1,
                               0.11, 0.12, 0.13, 0.14, 0.15],
                              [0.1, 0.2, 0.3, 0.4, 0.5,
                               0.01, 0.02, 0.03, 0.04, 0.05],
                              [1.1, 1.2, 1.3, 1.4, 1.5,
                               0.11, 0.12, 0.13, 0.14, 0.15]]))

v4 = nlp.Vocab(counter, max_size=None, min_freq=1, unknown_token='<unk>', padding_token=None,
               bos_token=None, eos_token=None, reserved_tokens=None)
v4.set_embedding(my_embed3, my_embed4)
assert v4.embedding.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
assert v4.embedding.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
assert_almost_equal(v4.embedding.idx_to_vec.asnumpy(),
                    np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                               0.11, 0.12, 0.13, 0.14, 0.15],
                              [1.1, 1.2, 1.3, 1.4, 1.5,
                               0.06, 0.07, 0.08, 0.09, 0.1],
                              [0.6, 0.7, 0.8, 0.9, 1,
                               0.11, 0.12, 0.13, 0.14, 0.15],
                              [0.1, 0.2, 0.3, 0.4, 0.5,
                               0.01, 0.02, 0.03, 0.04, 0.05],
                              [1.1, 1.2, 1.3, 1.4, 1.5,
                               0.11, 0.12, 0.13, 0.14, 0.15]]))
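The assertions above exercise nlp.Vocab.set_embedding. As a self-contained illustration of that pattern, here is a minimal sketch that builds a Counter, writes a toy pretrained-embedding file, and attaches it to a Vocab; the file name, tokens, and vector values are illustrative only, not the fixtures of the original test.

import os
import tempfile

import gluonnlp as nlp

counter = nlp.data.count_tokens(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

# A pretrained-embedding text file: one "token v1 v2 ..." line per token.
embed_path = os.path.join(tempfile.mkdtemp(), 'toy_embedding.txt')
with open(embed_path, 'w') as f:
    f.write('a 0.1 0.2 0.3 0.4 0.5\n')
    f.write('b 0.6 0.7 0.8 0.9 1.0\n')

my_embed = nlp.embedding.TokenEmbedding.from_file(embed_path, elem_delim=' ')
vocab = nlp.Vocab(counter)
vocab.set_embedding(my_embed)

print(vocab.embedding['a'])  # vector read from the file
print(vocab.embedding['c'])  # no pretrained vector -> zero vector by default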
def test_big_text_models(wikitext2_val_and_counter):
    # use a small vocabulary for testing
    val, val_freq = wikitext2_val_and_counter
    vocab = nlp.Vocab(val_freq)
    text_models = ['big_rnn_lm_2048_512']

    for model_name in text_models:
        eprint('testing forward for %s' % model_name)
        model, _ = nlp.model.get_model(model_name, vocab=vocab)
        print(model)
        model.collect_params().initialize()
        batch_size = 10
        hidden = model.begin_state(batch_size=batch_size, func=mx.nd.zeros)
        output, state = model(mx.nd.arange(330).reshape((33, 10)), hidden)
        output.wait_to_read()
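As a usage note for the forward pass above: in training or evaluation loops the returned state is normally carried over to the next batch and detached from the autograd graph (truncated BPTT). A minimal sketch of that pattern, reusing the names from the test above (the stand-in batches are illustrative):

def detach(hidden):
    # recursively detach hidden-state NDArrays from the autograd graph
    if isinstance(hidden, (tuple, list)):
        return [detach(h) for h in hidden]
    return hidden.detach()

hidden = model.begin_state(batch_size=batch_size, func=mx.nd.zeros)
for data in [mx.nd.arange(330).reshape((33, 10))] * 2:  # stand-in batches
    output, hidden = model(data, hidden)
    hidden = detach(hidden)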
# From a Pad/Stack batchify test; `batch_size`, `TOTAL_ELE_NUM`, `ndim`, `axis`,
# `length_min`, `length_max`, `pad_val`, `dtype` and `_dtype` are parameters of the
# surrounding test.
for pad_index in [[0], [1], [2], [0, 1], [1, 2], [0, 1, 2]]:
    shapes = [[[2 for _ in range(ndim)] for _ in range(batch_size)]
              for _ in range(TOTAL_ELE_NUM)]
    for j in pad_index:
        for i in range(batch_size):
            shapes[j][i][axis] = np.random.randint(length_min, length_max)
    random_data_npy = [tuple(np.random.normal(0, 1, shapes[j][i]).astype(dtype)
                             for j in range(TOTAL_ELE_NUM)) for i in range(batch_size)]
    batchify_fn = []
    for j in range(TOTAL_ELE_NUM):
        if j in pad_index:
            batchify_fn.append(batchify.Pad(axis=axis, pad_val=pad_val, ret_length=True,
                                            dtype=_dtype))
        else:
            batchify_fn.append(batchify.Stack(dtype=_dtype))
    batchify_fn = batchify.Tuple(batchify_fn)
    ret_use_npy = batchify_fn(random_data_npy)
    with pytest.warns(UserWarning):
        # Using Pad with NDArrays is discouraged for speed reasons.
        ret_use_mx = batchify_fn([tuple(mx.nd.array(ele[i], dtype=dtype)
                                        for i in range(TOTAL_ELE_NUM))
                                  for ele in random_data_npy])
    for i in range(TOTAL_ELE_NUM):
        if i in pad_index:
            assert ret_use_npy[i][0].dtype == ret_use_mx[i][0].dtype == dtype
            assert ret_use_npy[i][1].dtype == ret_use_mx[i][1].dtype == np.int32
            assert_allclose(ret_use_npy[i][0].asnumpy(),
                            ret_use_mx[i][0].asnumpy())
            assert_allclose(ret_use_npy[i][1].asnumpy(),
                            ret_use_mx[i][1].asnumpy())
            assert ret_use_npy[i][1].shape == (batch_size,)
        else:
            pass  # the Stack()-ed (non-padded) fields would be compared directly here
def test_bertvocab():
    ctx = mx.cpu()
    bert_base1, vocab1 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='book_corpus_wiki_en_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base2, vocab2 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='book_corpus_wiki_en_uncased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base3, vocab3 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_multilingual_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base4, vocab4 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_multilingual_uncased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base5, vocab5 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_cn_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
    bert_base6, vocab6 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='kobert_news_wiki_ko_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)
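A quick way to see what these vocabularies provide is to inspect their BERT-specific special tokens and run the matching WordPiece tokenizer. A small sketch, not part of the original test, using the cased English vocab loaded above; the example sentence is illustrative:

print(vocab1.cls_token, vocab1.sep_token, vocab1.mask_token, vocab1.padding_token)
print(vocab1[vocab1.cls_token])  # integer index of the '[CLS]' token

tokenizer = nlp.data.BERTTokenizer(vocab1, lower=False)
tokens = tokenizer('GluonNLP is great')
print(tokens)           # WordPiece sub-tokens
print(vocab1[tokens])   # their indices in the vocabulary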
def test_get_elmo_models():
    model_names = ['elmo_2x1024_128_2048cnn_1xhighway', 'elmo_2x2048_256_2048cnn_1xhighway',
                   'elmo_2x4096_512_2048cnn_2xhighway', 'elmo_2x4096_512_2048cnn_2xhighway']
    datasets = ['gbw', 'gbw', 'gbw', '5bw']

    for model_name, dataset in zip(model_names, datasets):
        print('testing forward for %s on dataset %s' % (model_name, dataset))
        model, _ = nlp.model.get_model(model_name,
                                       dataset_name=dataset,
                                       pretrained=dataset is not None,
                                       root='tests/data/model/')
        print(model)
        if not dataset:
            model.collect_params().initialize()
        begin_state = model.begin_state(mx.nd.zeros, batch_size=20)
        # ELMo consumes character ids of shape (batch_size, sequence_length, max_chars_per_token)
        output, state = model(mx.nd.arange(35000).reshape(20, 35, 50), begin_state)
        del model
        mx.nd.waitall()
# From a pretrained-BERT loading test; `model_name`, `dataset`, `has_missing_params`,
# `disable_missing_parameters`, `vocab_size` and `special_tokens` are supplied by the
# surrounding (parametrized) test.
if not has_missing_params:
    model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                       pretrained=True)
else:
    with pytest.raises(AssertionError):
        model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                           pretrained=True)

    if not disable_missing_parameters:
        model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                           pretrained=True,
                                           pretrained_allow_missing=True)
    elif 'biobert' in dataset:
        # BioBERT-specific test case
        model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                           pretrained=True,
                                           pretrained_allow_missing=True,
                                           use_decoder=False,
                                           use_classifier=False)
    elif 'clinicalbert' in dataset:
        # ClinicalBERT-specific test case
        model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                           pretrained=True,
                                           pretrained_allow_missing=True,
                                           use_decoder=False)
    else:
        assert False, "Testcase needs to be adapted."

assert len(vocab) == vocab_size[dataset]
for token in special_tokens:
    assert token in vocab, "Token %s not found in the vocab" % token
def test_word_embedding_analogy_evaluation_models(analogy_function):
    dataset = nlp.data.GoogleAnalogyTestSet()
    dataset = [d for i, d in enumerate(dataset) if i < 10]

    embedding = nlp.embedding.create('fasttext', source='wiki.simple')
    counter = nlp.data.utils.Counter(embedding.idx_to_token)
    vocab = nlp.vocab.Vocab(counter)
    vocab.set_embedding(embedding)

    dataset_coded = [[vocab[d[0]], vocab[d[1]], vocab[d[2]], vocab[d[3]]]
                     for d in dataset]
    dataset_coded_nd = nd.array(dataset_coded, dtype=np.int64)

    for k in [1, 3]:
        for exclude_question_words in [True, False]:
            evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
                idx_to_vec=vocab.embedding.idx_to_vec,
                analogy_function=analogy_function, k=k,
                exclude_question_words=exclude_question_words)
            evaluator.initialize()
def test_token_embedding_from_file_S3_with_custom_unknown_token(unknown_token):
    nlp.embedding.create('glove', source='glove.6B.50d',
                         unknown_token=unknown_token)
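For context, the unknown_token argument controls the token (and vector) returned for out-of-vocabulary lookups on the resulting TokenEmbedding. A small sketch, assuming the same GloVe source as the test above; the custom token string and lookup word are illustrative:

glove = nlp.embedding.create('glove', source='glove.6B.50d',
                             unknown_token='<custom_unk>')

print(glove.unknown_token)                  # '<custom_unk>'
oov_vec = glove['definitely-not-in-glove']  # OOV words map to the unknown token
print(oov_vec.sum())                        # zero vector by default (init_unknown_vec)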