def test_word2vec_embedding_load_binary_format():
    test_dir = os.path.dirname(os.path.realpath(__file__))
    with pytest.warns(UserWarning):  # UserWarning: skipped likely header line
        word2vec_vec = nlp.embedding.Word2Vec.from_file(
            os.path.join(str(test_dir), 'test_embedding', 'lorem_ipsum_w2v.vec'),
            elem_delim=' ')
    word2vec_bin = nlp.embedding.Word2Vec.from_w2v_binary(
        os.path.join(str(test_dir), 'test_embedding', 'lorem_ipsum_w2v.bin'))
    idx_to_vec = word2vec_bin[word2vec_vec.idx_to_token]
    assert np.all(
        np.isclose(a=word2vec_vec.idx_to_vec.asnumpy(),
                   b=idx_to_vec.asnumpy(), atol=0.001))
    assert all(token in word2vec_bin for token in word2vec_vec.idx_to_token)
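The test above compares the text and binary Word2Vec loaders. Outside a test, the same loader can be attached to a vocabulary; a minimal sketch, assuming a hypothetical file path 'my_vectors.vec' and the usual aliases (import gluonnlp as nlp):

import gluonnlp as nlp

# Hypothetical path to a Word2Vec text-format file; swap in a real one.
emb = nlp.embedding.Word2Vec.from_file('my_vectors.vec', elem_delim=' ')

# Build a vocabulary from a token counter and attach the loaded vectors.
counter = nlp.data.count_tokens(['hello', 'world', 'hello'])
vocab = nlp.Vocab(counter)
vocab.set_embedding(emb)
print(vocab.embedding['hello'].shape)  # vector for a known token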
def test_serialization(emb, tmp_path=tmp_path):
    emb_path = os.path.join(str(tmp_path), "emb.npz")
    if unknown_lookup:
        with pytest.warns(UserWarning):  # UserWarning: Serialization of `unknown_lookup` is not supported
            emb.serialize(emb_path)
    else:
        emb.serialize(emb_path)
    loaded_emb = nlp.embedding.TokenEmbedding.deserialize(emb_path)
    assert loaded_emb == emb
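Outside the test harness, the round trip is just two calls. A minimal sketch, assuming any pre-trained TokenEmbedding (here the 'glove.6B.50d' source) and a writable path 'emb.npz':

import gluonnlp as nlp

emb = nlp.embedding.create('glove', source='glove.6B.50d')   # any TokenEmbedding
emb.serialize('emb.npz')                                      # write .npz to disk
restored = nlp.embedding.TokenEmbedding.deserialize('emb.npz')
assert restored == emb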
except StopIteration:
    print('parsing is done')

# Build the label vocabulary and save it.
label_counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(map(lambda elm: elm[1], dataset)))
tmp_label_vocab = nlp.Vocab(label_counter, unknown_token=None)
label_vocab = Vocab(tmp_label_vocab.idx_to_token, unknown_token=None)
with open('./data/label_vocab.pkl', mode='wb') as io:
    pickle.dump(label_vocab, io)

# Split the dataset, then build the token vocabulary from the training split
# and attach pre-trained Korean fastText vectors.
tr, val = train_test_split(dataset, test_size=.1, random_state=777)
token_counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(map(lambda elm: elm[0], tr)))
tmp_token_vocab = nlp.Vocab(token_counter, min_freq=10)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_token_vocab.set_embedding(ptr_embedding)
token_vocab = Vocab(tmp_token_vocab.idx_to_token)
token_vocab.embedding = tmp_token_vocab.embedding.idx_to_vec.asnumpy()

# Save the token vocabulary and the train / validation splits.
with open('./data/token_vocab.pkl', mode='wb') as io:
    pickle.dump(token_vocab, io)
with open('./data/train.pkl', mode='wb') as io:
    pickle.dump(tr, io)
with open('./data/validation.pkl', mode='wb') as io:
    pickle.dump(val, io)
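Reading the artifacts back mirrors the saving code. A minimal sketch, assuming the files written above exist and that the custom Vocab class is importable when unpickling:

import pickle

import numpy as np

with open('./data/token_vocab.pkl', mode='rb') as io:
    token_vocab = pickle.load(io)

# `embedding` was overwritten with a plain numpy array before pickling, so it
# can be used directly, e.g. as the weight matrix of an embedding layer.
weights = np.asarray(token_vocab.embedding, dtype=np.float32)
print(weights.shape)  # (vocabulary size, embedding dimension)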
def validate_args(args):
    """Validate the provided arguments and act on --help."""
    if args.list_embedding_sources:
        print('Listing all sources for {} embeddings.'.format(
            args.embedding_name))
        print('Specify --embedding-name if you wish to '
              'list sources of other embeddings.')
        print('')
        if args.embedding_name not in nlp.embedding.list_sources().keys():
            print('Invalid embedding name.')
            print('Only {} are supported.'.format(', '.join(
                nlp.embedding.list_sources().keys())))
            sys.exit(1)
        print(' '.join(nlp.embedding.list_sources()[args.embedding_name]))
        sys.exit(0)

    if not (args.embedding_path or args.embedding_name):
        print('You must specify either --embedding-path or --embedding-name.')
        print('Use --embedding-path to load and evaluate '
              'word embeddings from a Word2Vec text format '
              'or fastText binary format file.')
        print('Use --embedding-name to download one of '
              'the pre-trained embedding files included in GluonNLP.')
        sys.exit(1)

    if args.embedding_name and not args.embedding_source:
        print('Please also specify --embedding-source'
              ' to select the version of the pre-trained embedding. '
              'Use --list-embedding-sources to see all available sources.')
        sys.exit(1)
        # The analogy task is open-vocabulary, so all known words must be kept.
        # If analogy is not being evaluated, there is no need to precompute
        # vectors now, because all words for the closed-vocabulary tasks can be
        # obtained via the unknown lookup.
        if not args.analogy_datasets:
            idx_to_token = []
        elif args.analogy_datasets and args.analogy_max_vocab_size:
            idx_to_token = idx_to_token[:args.analogy_max_vocab_size]
        embedding[''] = mx.nd.zeros(model.weight.shape[1])
        if idx_to_token:
            with utils.print_time('compute vectors for {} known '
                                  'words.'.format(len(idx_to_token))):
                embedding[idx_to_token] = model[idx_to_token]
    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)
    return embedding
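The validation logic above is driven by nlp.embedding.list_sources(), which can also be queried directly. A minimal sketch:

import gluonnlp as nlp

# All embedding families and their available sources.
sources = nlp.embedding.list_sources()
print(sorted(sources.keys()))            # e.g. ['fasttext', 'glove', 'word2vec']

# The sources of a single family can also be requested by name.
print(nlp.embedding.list_sources('fasttext')[:5])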
# Korean vocab (continued from above): attach the pre-trained vectors and pickle.
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
vocab_ko = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab_ko.embedding = array
vocab_ko_filepath = sample_dir / "vocab_ko.pkl"
config.update({"source_vocab": str(vocab_ko_filepath)})
with open(vocab_ko_filepath, mode='wb') as io:
    pickle.dump(vocab_ko, io)

# English vocab: stem, count, attach fastText 'wiki.simple' vectors and pickle.
split_en = Stemmer(language='en')
count_en = Counter(
    itertools.chain.from_iterable(tr['en'].apply(split_en.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_en)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple', load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
vocab_en = Vocab(tmp_vocab.idx_to_token)
vocab_en.embedding = array
vocab_en_filepath = sample_dir / "vocab_en.pkl"
config.update({"target_vocab": str(vocab_en_filepath)})
with open(vocab_en_filepath, mode='wb') as io:
    pickle.dump(vocab_en, io)

config.save("conf/dataset/sample.json")
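load_ngrams=True lets the fastText embedding compose vectors for out-of-vocabulary words from subword n-grams. A minimal sketch of that behaviour, assuming network access to download the 'wiki.simple' source:

import gluonnlp as nlp

emb = nlp.embedding.create('fasttext', source='wiki.simple', load_ngrams=True)

# A token that is unlikely to be in the vocabulary still receives a non-zero
# vector, composed from its character n-grams.
vec = emb['gluonnlpexampletoken']
print(vec.norm().asscalar() > 0)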
def get_pret_embs(self, word_dims=None):
    """Read the pre-trained embedding file.

    Parameters
    ----------
    word_dims : int or None
        Vector size. Use `None` to infer it from the embedding.

    Returns
    -------
    numpy.ndarray
        A T x C numpy array (T tokens, C dimensions).
    """
    assert self._pret_embeddings is not None, 'No pretrained file provided.'
    pret_embeddings = gluonnlp.embedding.create(self._pret_embeddings[0],
                                                source=self._pret_embeddings[1])
    embs = [None] * len(self._id2word)
    for idx, vec in enumerate(pret_embeddings.idx_to_vec):
        embs[idx] = vec.asnumpy()
    if word_dims is None:
        word_dims = len(pret_embeddings.idx_to_vec[0])
    # Tokens without a pre-trained vector fall back to a zero vector.
    for idx, emb in enumerate(embs):
        if emb is None:
            embs[idx] = np.zeros(word_dims)
    pret_embs = np.array(embs, dtype=np.float32)
    # Scale by the global standard deviation before returning.
    return pret_embs / np.std(pret_embs)
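The normalised array returned above is typically used to initialise an embedding layer. A minimal sketch with a random stand-in for the (T, C) output of get_pret_embs:

import numpy as np
import mxnet as mx
from mxnet.gluon import nn

# Stand-in for the (T, C) array that get_pret_embs would return.
pret_embs = np.random.randn(100, 50).astype(np.float32)

layer = nn.Embedding(input_dim=pret_embs.shape[0], output_dim=pret_embs.shape[1])
layer.initialize()
layer.weight.set_data(mx.nd.array(pret_embs))  # copy the pre-trained weights in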
def evaluate_analogy(args, token_embedding, ctx, logfile=None, global_step=0):
    """Evaluate on the specified analogy datasets.

    The analogy task is an open-vocabulary task, so make sure to pass a
    token_embedding with a sufficiently large number of supported tokens.
    """
    results = []
    exclude_question_words = not args.analogy_dont_exclude_question_words
    for analogy_function in args.analogy_functions:
        evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
            idx_to_vec=token_embedding.idx_to_vec,
            exclude_question_words=exclude_question_words,
            analogy_function=analogy_function)
        evaluator.initialize(ctx=ctx)
        if not args.no_hybridize:
            evaluator.hybridize()
        for (dataset_name, dataset_kwargs,
             dataset) in iterate_analogy_datasets(args):
            initial_length = len(dataset)
            # Keep only questions whose four words are all in the vocabulary.
            dataset_coded = [[
                token_embedding.token_to_idx[d[0]],
                token_embedding.token_to_idx[d[1]],
                token_embedding.token_to_idx[d[2]],
                token_embedding.token_to_idx[d[3]]
            ] for d in dataset if d[0] in token_embedding.token_to_idx
                              and d[1] in token_embedding.token_to_idx
                              and d[2] in token_embedding.token_to_idx
                              and d[3] in token_embedding.token_to_idx]
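WordEmbeddingAnalogy can also be driven directly, without the surrounding argument plumbing. A minimal sketch, assuming the 'glove.6B.50d' source (downloaded on first use) and that the three query words are in its vocabulary:

import mxnet as mx
import gluonnlp as nlp

emb = nlp.embedding.create('glove', source='glove.6B.50d')
evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
    idx_to_vec=emb.idx_to_vec, k=1, exclude_question_words=True)
evaluator.initialize()

# "man : king :: woman : ?" expressed as token indices.
words1, words2, words3 = (mx.nd.array([emb.token_to_idx[w]])
                          for w in ('man', 'king', 'woman'))
pred_idxs = evaluator(words1, words2, words3)
print(emb.idx_to_token[int(pred_idxs[0][0].asscalar())])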
tst_data = pd.read_csv(tst_filepath, sep='\t').loc[:, ['document', 'label']]
tst_data = tst_data.loc[tst_data['document'].isna().apply(lambda elm: not elm), :]

# extracting morphs from the sentences
tokenizer = MeCab()
tokenized = tr_data['document'].apply(tokenizer.morphs)
plt.hist(list(map(lambda elm: len(elm), tokenized)))
plt.show()

# building the vocab
counter = nlp.data.count_tokens(itertools.chain.from_iterable(tokenized))
vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None, eos_token=None)

# connecting the SISG (fastText) embedding to the vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)

# saving the vocab
with open('./data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)

# saving tr_data, val_data, tst_data
tr_data.to_csv('./data/train.txt', index=False, sep='\t')
val_data.to_csv('./data/val.txt', index=False, sep='\t')
tst_data.to_csv('./data/test.txt', index=False, sep='\t')
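Once the vocab with its attached fastText vectors is pickled, a downstream script only needs indices and the embedding matrix. A minimal sketch, assuming ./data/vocab.pkl was written by the code above; the example morphs are hypothetical:

import pickle

with open('./data/vocab.pkl', mode='rb') as io:
    vocab = pickle.load(io)

tokens = ['영화', '재미', '있다']               # hypothetical example morphs
indices = vocab[tokens]                         # token -> index lookup
vectors = vocab.embedding.idx_to_vec[indices]   # corresponding fastText rows
print(indices, vectors.shape)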
def get_args():
    """Construct the argument parser."""
    parser = argparse.ArgumentParser(
        description='Word embedding training with Gluon.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Embedding arguments
    group = parser.add_argument_group('Embedding arguments')
    group.add_argument('--embedding-name', type=str, default='fasttext',
                       help=('Name of embedding type to load. '
                             'Valid entries: {}'.format(
                                 ', '.join(nlp.embedding.list_sources().keys()))))
    group.add_argument('--embedding-source', type=str, default='wiki.simple',
                       help=('Source from which to initialize the embedding. '
                             'Pass --list-embedding-sources to get a list of '
                             'valid sources for a given --embedding-name.'))
    group.add_argument('--list-embedding-sources', action='store_true')

    # Evaluation arguments
    group = parser.add_argument_group('Evaluation arguments')
    group.add_argument('--ignore-oov', action='store_true',
                       help='Drop OOV words from evaluation datasets.')
    ## Datasets
    group.add_argument(
        '--similarity-datasets', type=str,
        default=nlp.data.word_embedding_evaluation.word_similarity_datasets,
        nargs='*',
        help='Word similarity datasets to use for intrinsic evaluation.')
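The snippet ends before get_args returns; a hypothetical entry point, assuming get_args() returns the parser it builds, could wire it to validate_args like this:

if __name__ == '__main__':
    parser = get_args()           # assuming get_args() returns the parser it builds
    args = parser.parse_args()
    validate_args(args)
    # ... continue with embedding loading and the evaluation functions above ...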