# Example: evaluating word analogies with a pretrained fastText embedding.
import numpy as np
from mxnet import nd

import gluonnlp as nlp


def test_word_embedding_analogy_evaluation_models(analogy_function):
    # Use a small slice of the Google analogy test set to keep the test fast.
    dataset = nlp.data.GoogleAnalogyTestSet()
    dataset = [d for i, d in enumerate(dataset) if i < 10]

    embedding = nlp.embedding.create('fasttext', source='wiki.simple')
    counter = nlp.data.utils.Counter(embedding.idx_to_token)
    vocab = nlp.vocab.Vocab(counter)
    vocab.set_embedding(embedding)

    # Map each analogy quadruple (a : b :: c : d) to token indices.
    dataset_coded = [[vocab[d[0]], vocab[d[1]], vocab[d[2]], vocab[d[3]]]
                     for d in dataset]
    dataset_coded_nd = nd.array(dataset_coded, dtype=np.int64)

    for k in [1, 3]:
        for exclude_question_words in [True, False]:
            evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
                idx_to_vec=vocab.embedding.idx_to_vec,
                analogy_function=analogy_function, k=k,
                exclude_question_words=exclude_question_words)
            evaluator.initialize()
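            # A minimal continuation sketch (assumed; the original snippet stops
            # here): query the evaluator with the first three words of each
            # analogy and read back the top-k candidate answer indices.
            words1, words2, words3 = (dataset_coded_nd[:, i] for i in range(3))
            pred_idxs = evaluator(words1, words2, words3)  # shape (num_analogies, k)
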
# Example: creating a GloVe TokenEmbedding with a custom unknown token.
def test_token_embedding_from_file_S3_with_custom_unknown_token(unknown_token):
    nlp.embedding.create('glove', source='glove.6B.50d',
                         unknown_token=unknown_token)
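A standalone sketch of the same call with a concrete value (the token below is illustrative, not the actual pytest parameter); the resulting TokenEmbedding reports the token back through its unknown_token property:

import gluonnlp as nlp

emb = nlp.embedding.create('glove', source='glove.6B.50d', unknown_token='<unk>')
assert emb.unknown_token == '<unk>'
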
# Example: building Korean and English vocabularies with pretrained fastText vectors.
import itertools
import pickle
from collections import Counter
from pathlib import Path

import pandas as pd
import gluonnlp as nlp

from model.split import Stemmer
from model.utils import Vocab
from utils import Config

# loading dataset
sample_dir = Path('sample')
config = Config("conf/dataset/sample.json")
tr = pd.read_csv(config.train, sep='\t')

# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(itertools.chain.from_iterable(tr['ko'].apply(split_ko.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_ko, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko', load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab_ko.embedding = array
vocab_ko_filepath = sample_dir / "vocab_ko.pkl"
config.update({"source_vocab": str(vocab_ko_filepath)})

with open(vocab_ko_filepath, mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(itertools.chain.from_iterable(tr['en'].apply(split_en.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_en)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple', load_ngrams=True)

# Example: selecting a pretrained embedding by name from command-line arguments.
def load_embedding_from_gluonnlp(args):
    if args.embedding_name.lower() == 'fasttext':
        # fastText can optionally load subword n-gram vectors for OOV words.
        token_embedding = nlp.embedding.create(
            args.embedding_name,
            source=args.embedding_source,
            load_ngrams=args.fasttext_load_ngrams)
    else:
        token_embedding = nlp.embedding.create(
            args.embedding_name, source=args.embedding_source)
    return token_embedding
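A minimal usage sketch, assuming an argparse-style namespace whose attribute names mirror the ones the function reads (the concrete values are only illustrative):

from types import SimpleNamespace

args = SimpleNamespace(embedding_name='fasttext',
                       embedding_source='wiki.simple',
                       fasttext_load_ngrams=True)
token_embedding = load_embedding_from_gluonnlp(args)
print(token_embedding.idx_to_vec.shape)  # (vocabulary size, embedding dimension)
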
# Example: attaching GloVe vectors to a SQuAD word-level vocabulary
# (method of a VocabProvider-style class; isfile, pickle and gluonnlp (as nlp)
# are imported at module level in the original file).
def get_word_level_vocab(self, embedding_size):  # signature assumed from the body
    """
    Returns
    -------
    Vocab
        Word level vocabulary
    """
    # Reuse a cached vocabulary if one was pickled on a previous run.
    if self._options.word_vocab_path and isfile(self._options.word_vocab_path):
        return pickle.load(open(self._options.word_vocab_path, 'rb'))

    all_words = []
    for dataset in self._datasets:
        all_words.extend(self._get_all_word_tokens(dataset))

    word_level_vocab = VocabProvider._create_squad_vocab(all_words)
    word_level_vocab.set_embedding(
        nlp.embedding.create('glove', source='glove.6B.{}d'.format(embedding_size)))

    # Count how many vocabulary entries received a non-zero pretrained vector.
    count = 0
    for i in range(len(word_level_vocab)):
        if (word_level_vocab.embedding.idx_to_vec[i].sum() != 0).asscalar():
            count += 1
    print('{}/{} words have embeddings'.format(count, len(word_level_vocab)))

    if self._options.word_vocab_path:
        pickle.dump(word_level_vocab, open(self._options.word_vocab_path, 'wb'))

    return word_level_vocab
# Example: building a question-pair vocabulary with pretrained Korean fastText vectors.
import itertools
import pickle
from collections import Counter
from pathlib import Path

import pandas as pd
import gluonnlp as nlp

from model.split import split_morphs
from model.utils import Vocab
from utils import Config

qpair_dir = Path("qpair")
config = Config("conf/dataset/qpair.json")
train = pd.read_csv(config.train, sep="\t")

# tokenizing both questions and counting tokens
list_of_tokens_qa = train["question1"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb
count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))

# attaching the pretrained vectors and exporting them as a NumPy array
tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)
vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(qpair_dir / "vocab.pkl")})
config.save("conf/dataset/qpair.json")
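A follow-up sketch (an assumption, not part of the original script): the exported NumPy matrix has the right shape to seed an MXNet Gluon Embedding layer later on.

import mxnet as mx

embedding_layer = mx.gluon.nn.Embedding(*vocab.embedding.shape)  # (vocab size, dim)
embedding_layer.initialize()
embedding_layer.weight.set_data(mx.nd.array(vocab.embedding))
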
# Example: SQuAD preprocessing, counting word and character tokens and attaching
# GloVe vectors to the word vocabulary (fragment of a larger pipeline; the mappers,
# reducers, pool, tic, tqdm and flags used below are defined earlier in that pipeline).
# The word counts are gathered the same way as the character counts below
# (the opening of this statement is assumed; only its tail appeared in the snippet).
word_counts = list(tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
                             total=len(word_partitioned)))
print('Word counters received in {:.3f} sec'.format(time.time() - tic))

tic = time.time()
print('Char counters receiving started.')
char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
char_reducer = SQuADAsyncVocabReducer()
char_mapped = list(
    tqdm.tqdm(char_mapper.run_async(itertools.chain(train_examples, dev_examples), pool),
              total=len(train_examples) + len(dev_examples)))
char_partitioned = SQuADDataPipeline._partition(itertools.chain(*char_mapped))
char_counts = list(tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
                             total=len(char_partitioned)))
print('Char counters received in {:.3f} sec'.format(time.time() - tic))

embedding = nlp.embedding.create('glove', source=emb_file_name)
if is_cased_embedding:
    # Keep several casings of each word so cased GloVe files can be matched.
    word_counts = itertools.chain(*[[(item[0], item[1]),
                                     (item[0].lower(), item[1]),
                                     (item[0].capitalize(), item[1]),
                                     (item[0].upper(), item[1])] for item in word_counts])
else:
    word_counts = [(item[0].lower(), item[1]) for item in word_counts]

word_vocab = Vocab({item[0]: item[1] for item in word_counts if
                    not shrink_word_vocab or item[0] in embedding.token_to_idx},
                   bos_token=None, eos_token=None)
word_vocab.set_embedding(embedding)
char_vocab = Vocab({item[0]: item[1] for item in char_counts},
                   bos_token=None, eos_token=None)
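A brief inspection sketch (assumed, not part of the pipeline above): once set_embedding has run, the GloVe vectors sit on word_vocab and can be read out per token or as a full matrix.

glove_matrix = word_vocab.embedding.idx_to_vec
print(glove_matrix.shape)               # (len(word_vocab), embedding dimension)
print(word_vocab.embedding['the'][:5])  # first few components of one word vector
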
# Example: building an NSMC sentiment vocabulary with Korean fastText (SISG) vectors.
import itertools
import pickle
from collections import Counter
from pathlib import Path

import pandas as pd
import gluonnlp as nlp

from model.split import split_morphs
from model.utils import Vocab
from utils import Config

# loading dataset
nsmc_dir = Path("nsmc")
config = Config("conf/dataset/nsmc.json")
tr = pd.read_csv(config.train, sep="\t").loc[:, ["document", "label"]]
# extracting morphs from sentences
list_of_tokens = tr["document"].apply(split_morphs).tolist()
# generating the vocab
token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(
    counter=token_counter, min_freq=10, bos_token=None, eos_token=None
)
# connecting the SISG (fastText) embedding with the vocab
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko")
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
vocab = Vocab(
    tmp_vocab.idx_to_token,
    padding_token="<pad>",  # assumed: the original special-token string was lost
    unknown_token="<unk>",  # assumed likewise
    bos_token=None,
    eos_token=None,
)
vocab.embedding = array
# saving vocab
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)
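A short reload sketch (an assumption, not part of the original script: it presumes the same model.utils.Vocab class is importable when unpickling):

with open(nsmc_dir / "vocab.pkl", mode="rb") as io:
    restored = pickle.load(io)
print(restored.embedding.shape)  # (vocabulary size, fastText dimension)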