assert v9.unknown_token == '<unk>'
assert v9.reserved_tokens == ['b', 'a']
assert v9.embedding is None
assert 'a' in v9
v10 = nlp.Vocab(counter, max_size=None, min_freq=100, unknown_token='<unk>',
                padding_token=None, bos_token=None, eos_token=None, reserved_tokens=['b', 'c'])
assert len(v10) == 3
assert v10.token_to_idx == {'<unk>': 0, 'b': 1, 'c': 2}
assert v10.idx_to_token[1] == 'b'
assert v10.unknown_token == '<unk>'
assert v10.reserved_tokens == ['b', 'c']
assert v10.embedding is None
assert 'a' not in v10
v11 = nlp.Vocab(counter, max_size=1, min_freq=2, unknown_token='<unk>',
                padding_token=None, bos_token=None, eos_token=None,
                reserved_tokens=['<pad>', 'b'])
assert len(v11) == 4
assert v11.token_to_idx == {'<unk>': 0, '<pad>': 1, 'b': 2, 'c': 3}
assert v11.idx_to_token[1] == '<pad>'
assert v11.unknown_token == '<unk>'
assert v11.reserved_tokens == ['<pad>', 'b']
assert v11.embedding is None
assert 'a' not in v11
v12 = nlp.Vocab(counter, max_size=None, min_freq=2, unknown_token='b',
                padding_token=None, bos_token=None, eos_token=None, reserved_tokens=['<pad>'])
assert len(v12) == 3
assert v12.token_to_idx == {'b': 0, '<pad>': 1, 'c': 2}
assert v12.idx_to_token[1] == '<pad>'
assert v12.unknown_token == 'b'
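
# Hedged, standalone sketch (not part of the test above) of the index ordering the
# asserts rely on: the unknown token is indexed first, then reserved tokens, then the
# remaining tokens by descending frequency. The toy counter below is an assumption;
# it is not the `counter` fixture used in the test.
import gluonnlp as nlp

toy_counter = nlp.data.count_tokens(['a', 'a', 'a', 'b', 'b', 'c'])
toy_vocab = nlp.Vocab(toy_counter, unknown_token='<unk>', padding_token=None,
                      bos_token=None, eos_token=None, reserved_tokens=['b'])
print(toy_vocab.idx_to_token)                   # ['<unk>', 'b', 'a', 'c'] under this assumption
print(toy_vocab['a'], toy_vocab['never_seen'])  # unseen tokens map to the unknown index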
assert v3.embedding.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
assert v3.embedding.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
assert_almost_equal(v3.embedding.idx_to_vec.asnumpy(),
np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
0.11, 0.12, 0.13, 0.14, 0.15],
[1.1, 1.2, 1.3, 1.4, 1.5,
0.06, 0.07, 0.08, 0.09, 0.1],
[0.6, 0.7, 0.8, 0.9, 1,
0.11, 0.12, 0.13, 0.14, 0.15],
[0.1, 0.2, 0.3, 0.4, 0.5,
0.01, 0.02, 0.03, 0.04, 0.05],
[1.1, 1.2, 1.3, 1.4, 1.5,
0.11, 0.12, 0.13, 0.14, 0.15]])
)
v4 = nlp.Vocab(counter, max_size=None, min_freq=1, unknown_token='<unk>', padding_token=None,
               bos_token=None, eos_token=None, reserved_tokens=None)
v4.set_embedding(my_embed3, my_embed4)
assert v4.embedding.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
assert v4.embedding.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
assert_almost_equal(v4.embedding.idx_to_vec.asnumpy(),
np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
0.11, 0.12, 0.13, 0.14, 0.15],
[1.1, 1.2, 1.3, 1.4, 1.5,
0.06, 0.07, 0.08, 0.09, 0.1],
[0.6, 0.7, 0.8, 0.9, 1,
0.11, 0.12, 0.13, 0.14, 0.15],
[0.1, 0.2, 0.3, 0.4, 0.5,
0.01, 0.02, 0.03, 0.04, 0.05],
[1.1, 1.2, 1.3, 1.4, 1.5,
0.11, 0.12, 0.13, 0.14, 0.15]])
)
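
# Hedged sketch (separate from the test above): Vocab.set_embedding accepts multiple
# TokenEmbedding instances and concatenates their vectors per token, which is why the
# 10-dimensional rows above are two 5-dimensional embeddings joined together. The GloVe
# sources below are assumptions used only for illustration and trigger a download.
import gluonnlp as nlp

vocab = nlp.Vocab(nlp.data.count_tokens(['hello', 'world', 'hello']))
glove_50 = nlp.embedding.create('glove', source='glove.6B.50d')
glove_100 = nlp.embedding.create('glove', source='glove.6B.100d')
vocab.set_embedding(glove_50, glove_100)
print(vocab.embedding.idx_to_vec.shape)  # (len(vocab), 150) if both sources load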
def test_big_text_models(wikitext2_val_and_counter):
    # use a small vocabulary for testing
    val, val_freq = wikitext2_val_and_counter
    vocab = nlp.Vocab(val_freq)
    text_models = ['big_rnn_lm_2048_512']

    for model_name in text_models:
        eprint('testing forward for %s' % model_name)
        model, _ = nlp.model.get_model(model_name, vocab=vocab)
        print(model)
        model.collect_params().initialize()
        batch_size = 10
        hidden = model.begin_state(batch_size=batch_size, func=mx.nd.zeros)
        output, state = model(mx.nd.arange(330).reshape((33, 10)), hidden)
        output.wait_to_read()
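
# Hedged, self-contained sketch of the same forward-pass pattern, but with real token
# ids produced by a Vocab instead of mx.nd.arange. The tiny vocabulary, the toy
# sentence, and the choice of 'standard_lstm_lm_200' are assumptions for illustration.
import mxnet as mx
import gluonnlp as nlp

toy_vocab = nlp.Vocab(nlp.data.count_tokens('the quick brown fox jumps'.split()))
lm, _ = nlp.model.get_model('standard_lstm_lm_200', vocab=toy_vocab, pretrained=False)
lm.collect_params().initialize()
tokens = ['the', 'quick', 'brown', 'fox']
inputs = mx.nd.array(toy_vocab[tokens]).reshape((len(tokens), 1))  # (seq_len, batch_size=1)
state = lm.begin_state(batch_size=1, func=mx.nd.zeros)
output, state = lm(inputs, state)
output.wait_to_read()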
def _build_vocab(data_name, train_dataset, test_dataset, dev_dataset):
    all_token = []
    max_len = 0
    for dataset in (train_dataset, dev_dataset, test_dataset):
        for line in dataset:
            line = _clean_str(line[0], data_name).split()
            max_len = max_len if max_len > len(line) else len(line)
            all_token.extend(line)
    vocab = nlp.Vocab(nlp.data.count_tokens(all_token))
    vocab.set_embedding(nlp.embedding.create('Word2Vec', source='GoogleNews-vectors-negative300'))
    for word in vocab.embedding._idx_to_token:
        if (vocab.embedding[word] == nd.zeros(300)).sum() == 300:
            vocab.embedding[word] = nd.random.uniform(0, 0.05, 300)
    vocab.embedding['<unk>'] = nd.random.uniform(0, 0.05, 300)
    vocab.embedding['<pad>'] = nd.zeros(300)
    vocab.embedding['<bos>'] = nd.zeros(300)
    vocab.embedding['<eos>'] = nd.zeros(300)
    print('maximum length (in tokens): ', max_len)
    return vocab, max_len
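
# Hedged sketch of how a vocab with attached embeddings (like the one returned by
# _build_vocab) is typically consumed: copy vocab.embedding.idx_to_vec into a Gluon
# Embedding block. The toy counter and GloVe source below are assumptions.
from mxnet import gluon, nd
import gluonnlp as nlp

toy_vocab = nlp.Vocab(nlp.data.count_tokens(['good', 'movie', 'bad', 'movie']))
toy_vocab.set_embedding(nlp.embedding.create('glove', source='glove.6B.50d'))
embed_dim = toy_vocab.embedding.idx_to_vec.shape[1]
embedding_layer = gluon.nn.Embedding(len(toy_vocab), embed_dim)
embedding_layer.initialize()
embedding_layer.weight.set_data(toy_vocab.embedding.idx_to_vec)
print(embedding_layer(nd.array(toy_vocab[['good', 'movie']])).shape)  # (2, embed_dim)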
    word_vocab.set_embedding(embed)
    label_vectors = []
    for id_ in range(len(label2id.keys())):
        label = id2label[id_]
        label_words = label.split('_')
        label_vectors.append(word_vocab.embedding[label_words].asnumpy().sum(0))
    affinity = cosine_similarity(label_vectors)
else:
    print("BOW features for ontonotes")
    words = []
    for label in label2id.keys():
        label = label.replace('/', ' ')
        labels = label.strip().split()
        words += labels
    word_counter = gluonnlp.data.count_tokens(words)
    word_vocab = gluonnlp.Vocab(word_counter)
    embed = gluonnlp.embedding.create(emb_name, source=emb_source)
    word_vocab.set_embedding(embed)
    label_list = []
    label_vectors = []
    for id_ in range(len(label2id.keys())):
        label = id2label[id_]
        label = label.replace('/', ' ')
        labels = label.strip().split()
        label_list.append(labels)
        label_vectors.append(word_vocab.embedding[labels].asnumpy().sum(0))
    label_vectors = np.array(label_vectors)
    affinity = cosine_similarity(label_vectors)
matrix = np.zeros((len(label2id.keys()), len(label2id.keys())))
if goal == 'onto':
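
# Hedged, self-contained sketch of the label-affinity idea in the fragment above: each
# label is embedded as the sum of its word vectors, and pairwise cosine similarity gives
# the affinity matrix. The labels and the GloVe source are assumptions for illustration.
import numpy as np
import gluonnlp
from sklearn.metrics.pairwise import cosine_similarity

labels = ['person', 'person artist', 'location']
word_vocab = gluonnlp.Vocab(gluonnlp.data.count_tokens(
    [w for label in labels for w in label.split()]))
word_vocab.set_embedding(gluonnlp.embedding.create('glove', source='glove.6B.50d'))
label_vectors = np.array([word_vocab.embedding[label.split()].asnumpy().sum(0)
                          for label in labels])
affinity = cosine_similarity(label_vectors)  # shape (3, 3)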
def build_vocab(file_list=['crowd/dev.json', 'crowd/train_m.json', 'crowd/test.json',
                           'ontonotes/augmented_train.json', 'ontonotes/g_dev.json',
                           'ontonotes/g_test.json', 'distant_supervision/headword_train.json',
                           'distant_supervision/headword_dev.json',
                           'distant_supervision/el_dev.json',
                           'distant_supervision/el_train.json']):
    data_path = "data/release/"
    words = []
    for file in file_list:
        file_name = data_path + file
        with open(file_name) as f:
            line_elems = [json.loads(sent.strip()) for sent in f.readlines()]
            mention_seq = [line_elem["mention_span"].split() for line_elem in line_elems]
            left_seq = [line_elem['left_context_token'] for line_elem in line_elems]
            right_seq = [line_elem['right_context_token'] for line_elem in line_elems]
        for _ in mention_seq + right_seq + left_seq:
            words += [tok.lower() for tok in _]
    counter = gluonnlp.data.count_tokens(words)
    vocab = gluonnlp.Vocab(counter)
    with open('data/release/idx_to_token', 'w') as g:
        g.write('\n'.join(vocab.idx_to_token))
    with open('data/release/token_to_idx.json', 'w') as g:
        json.dump(vocab.token_to_idx, g)
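
# Hedged sketch of a round-trip alternative to the manual dump in build_vocab above:
# gluonnlp's Vocab can serialize itself to and from JSON. The toy counter and output
# path below are assumptions for illustration only.
import gluonnlp

toy_vocab = gluonnlp.Vocab(gluonnlp.data.count_tokens(['token', 'token', 'example']))
with open('vocab.json', 'w') as g:
    g.write(toy_vocab.to_json())
with open('vocab.json') as f:
    restored = gluonnlp.Vocab.from_json(f.read())
assert restored.token_to_idx == toy_vocab.token_to_idx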
self.text_vocab = text_vocab
self.seq_len = seq_len
self.bert_tokenizer = nlp.data.BERTTokenizer(vocab=text_vocab, lower=not is_cased)
train_sentences = [] if train_path is None else load_segment(train_path, self.bert_tokenizer)
dev_sentences = [] if dev_path is None else load_segment(dev_path, self.bert_tokenizer)
test_sentences = [] if test_path is None else load_segment(test_path, self.bert_tokenizer)
all_sentences = train_sentences + dev_sentences + test_sentences
if tag_vocab is None:
    logging.info('Indexing tags...')
    tag_counter = nlp.data.count_tokens(token.tag
                                        for sentence in all_sentences for token in sentence)
    self.tag_vocab = nlp.Vocab(tag_counter, padding_token=NULL_TAG,
                               bos_token=None, eos_token=None, unknown_token=None)
else:
    self.tag_vocab = tag_vocab
self.null_tag_index = self.tag_vocab[NULL_TAG]
if len(test_sentences) > 0:
    logging.info('example test sentences:')
    for i in range(2):
        logging.info(str(test_sentences[i]))
self.train_inputs = [self._encode_as_input(sentence) for sentence in train_sentences]
self.dev_inputs = [self._encode_as_input(sentence) for sentence in dev_sentences]
self.test_inputs = [self._encode_as_input(sentence) for sentence in test_sentences]
logging.info('tag_vocab: %s', self.tag_vocab)
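
# Hedged sketch of where text_vocab typically comes from in a setup like the one above:
# the vocabulary returned alongside a pretrained BERT model. The model and dataset names
# below are assumptions for illustration; loading them triggers a download.
import gluonnlp as nlp

bert_model, text_vocab = nlp.model.get_model(
    'bert_12_768_12',
    dataset_name='book_corpus_wiki_en_uncased',
    pretrained=True,
    use_pooler=False, use_decoder=False, use_classifier=False)
tokenizer = nlp.data.BERTTokenizer(vocab=text_vocab, lower=True)
print(tokenizer('GluonNLP tokenizes into wordpieces'))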
def _create_squad_vocab(all_tokens):
    """Provides vocabulary based on list of tokens

    Parameters
    ----------
    all_tokens : List[str]
        List of all tokens

    Returns
    -------
    Vocab
        Vocabulary
    """
    counter = data.count_tokens(all_tokens)
    vocab = Vocab(counter)
    return vocab
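
# Hedged usage sketch for _create_squad_vocab: feed it a flat token list and use the
# resulting Vocab to turn tokens into indices. The tokens below are made up.
all_tokens = ['what', 'is', 'the', 'answer', 'the', 'answer']
squad_vocab = _create_squad_vocab(all_tokens)
print(len(squad_vocab))                       # special tokens plus unique tokens
print(squad_vocab[['the', 'answer', 'oov']])  # unseen words map to the unknown index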