def _test_count_tokens(token_delim, seq_delim):
    source_str = _get_test_str_of_tokens(token_delim, seq_delim)
    tokens = list(simple_tokenize(source_str, token_delim, seq_delim))

    cnt1 = nlp.data.count_tokens(tokens, to_lower=False)
    assert cnt1 == nlp.data.utils.Counter(
        {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})

    cnt2 = nlp.data.count_tokens(tokens, to_lower=True)
    assert cnt2 == nlp.data.utils.Counter(
        {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}), cnt2

    counter_to_update = nlp.data.utils.Counter({'life': 2})
    cnt3 = nlp.data.utils.count_tokens(tokens, to_lower=False,
                                       counter=counter_to_update.copy())
    assert cnt3 == nlp.data.utils.Counter(
        {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})
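For reference, a minimal standalone sketch of the behaviour the test above checks (only gluonnlp is assumed to be installed; the toy sentence below is made up):

import gluonnlp as nlp

# Hypothetical toy tokens, not taken from the test fixture above.
tokens = 'Life is great ! life is good .'.split()

cnt = nlp.data.count_tokens(tokens)                  # case-sensitive: 'Life' != 'life'
low = nlp.data.count_tokens(tokens, to_lower=True)   # case-folded: counts merge
print(cnt['life'], low['life'])                      # 1 2

# Passing an existing Counter accumulates into it instead of starting fresh.
merged = nlp.data.count_tokens(tokens, counter=nlp.data.utils.Counter({'life': 2}))
print(merged['life'])                                # 3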
tr_data, val_data = train_test_split(data, test_size=.2)
tst_filepath = proj_dir / 'data/ratings_test.txt'
tst_data = pd.read_csv(tst_filepath, sep='\t').loc[:, ['document', 'label']]
tst_data = tst_data.loc[tst_data['document'].notna(), :]
# extract morphemes from each training sentence
tokenizer = MeCab()
tokenized = tr_data['document'].apply(tokenizer.morphs)
plt.hist(tokenized.apply(len))
plt.show()
# build the vocabulary from the tokenized training data
counter = nlp.data.count_tokens(itertools.chain.from_iterable(tokenized))
vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None, eos_token=None)
# attach the pretrained fastText (SISG) embedding to the vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)
# saving vocab
with open('./data/vocab.pkl', mode='wb') as io:
pickle.dump(vocab, io)
# saving tr_data, val_data, tst_data
tr_data.to_csv('./data/train.txt', index=False, sep='\t')
val_data.to_csv('./data/val.txt', index=False, sep='\t')
tst_data.to_csv('./data/test.txt', index=False, sep='\t')
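A brief sketch of how the vocabulary pickled above could be consumed later, e.g. to map an already tokenized sentence to indices; the sample morphemes are illustrative and only the file written by the snippet is assumed:

import pickle

# Reload the gluonnlp Vocab saved by the preprocessing step above.
with open('./data/vocab.pkl', mode='rb') as io:
    vocab = pickle.load(io)

sample_morphs = ['영화', '가', '재밌', '다']      # hypothetical MeCab output
indices = vocab.to_indices(sample_morphs)        # OOV morphemes map to the <unk> index
vectors = vocab.embedding.idx_to_vec[indices]    # rows of the attached fastText embedding
print(indices, vectors.shape)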
def build_vocab(file_list=['crowd/dev.json', 'crowd/train_m.json', 'crowd/test.json',
                           'ontonotes/augmented_train.json', 'ontonotes/g_dev.json',
                           'ontonotes/g_test.json', 'distant_supervision/headword_train.json',
                           'distant_supervision/headword_dev.json',
                           'distant_supervision/el_dev.json', 'distant_supervision/el_train.json']):
    data_path = "data/release/"
    words = []
    for file in file_list:
        file_name = data_path + file
        with open(file_name) as f:
            line_elems = [json.loads(sent.strip()) for sent in f.readlines()]
        mention_seq = [line_elem["mention_span"].split() for line_elem in line_elems]
        left_seq = [line_elem['left_context_token'] for line_elem in line_elems]
        right_seq = [line_elem['right_context_token'] for line_elem in line_elems]
        for seq in mention_seq + right_seq + left_seq:
            words += [tok.lower() for tok in seq]
    counter = gluonnlp.data.count_tokens(words)
    vocab = gluonnlp.Vocab(counter)
    with open('data/release/idx_to_token', 'w') as g:
        g.write('\n'.join(vocab.idx_to_token))
    with open('data/release/token_to_idx.json', 'w') as g:
        json.dump(vocab.token_to_idx, g)
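As a quick sanity check, the two files written by build_vocab can be reloaded and verified against each other; a sketch assuming the paths above exist and no token contains a newline:

import json

with open('data/release/idx_to_token') as f:
    idx_to_token = f.read().split('\n')
with open('data/release/token_to_idx.json') as f:
    token_to_idx = json.load(f)

# The list file and the JSON mapping should describe the same vocabulary.
assert all(token_to_idx[tok] == idx for idx, tok in enumerate(idx_to_token))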
        data = []
        continue
    except StopIteration:
        print('parsing is done')
label_counter = nlp.data.count_tokens(itertools.chain.from_iterable(map(lambda elm: elm[1], dataset)))
tmp_label_vocab = nlp.Vocab(label_counter, unknown_token=None)
label_vocab = Vocab(tmp_label_vocab.idx_to_token, unknown_token=None)
with open('./data/label_vocab.pkl', mode='wb') as io:
pickle.dump(label_vocab, io)
tr, val = train_test_split(dataset, test_size=.1, random_state=777)
token_counter = nlp.data.count_tokens(itertools.chain.from_iterable(map(lambda elm: elm[0], tr)))
tmp_token_vocab = nlp.Vocab(token_counter, min_freq=10)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_token_vocab.set_embedding(ptr_embedding)
token_vocab = Vocab(tmp_token_vocab.idx_to_token)
token_vocab.embedding = tmp_token_vocab.embedding.idx_to_vec.asnumpy()
with open('./data/token_vocab.pkl', mode='wb') as io:
pickle.dump(token_vocab, io)
with open('./data/train.pkl', mode='wb') as io:
pickle.dump(tr, io)
with open('./data/validation.pkl', mode='wb') as io:
pickle.dump(val, io)
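A sketch of how the pickled artifacts above might be reloaded on the training side; it only assumes the attributes set in the snippet (the custom Vocab objects and the numpy embedding matrix attached to token_vocab.embedding):

import pickle
import numpy as np

with open('./data/token_vocab.pkl', mode='rb') as io:
    token_vocab = pickle.load(io)
with open('./data/label_vocab.pkl', mode='rb') as io:
    label_vocab = pickle.load(io)

embedding_matrix = np.asarray(token_vocab.embedding)
print(embedding_matrix.shape)  # (vocabulary size, fastText dimension, typically 300)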
special_tokens['unknown_token'] = '<unk>'

# Discover special tokens
if ['<eos>'] == corpus.vocab.special:
    if '<eos>' in sym2idx:  # Only include if special token is actually used
        special_tokens['eos_token'] = '<eos>'
elif '<s>' in sym2idx:
    # Special case for model trained on Google 1 Billion Word LM dataset
    special_tokens['eos_token'] = '<s>'
elif corpus.vocab.special:
    raise NotImplementedError('Provided TransformerXL cache.pkl uses an unknown special token. '
                              'You must extend the `to_gluon_vocab` method to support it.')
else:
    special_tokens['eos_token'] = None

counter = nlp.data.count_tokens(sym2idx.keys())
vocab = nlp.vocab.Vocab(counter, token_to_idx=sym2idx, **special_tokens)
return vocab
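The core pattern in this converter is handing a pre-existing symbol-to-index mapping to Vocab via token_to_idx so the Gluon vocabulary reproduces the original indices; a self-contained sketch with a toy mapping (tokens and indices are made up):

import gluonnlp as nlp

# Toy mapping standing in for corpus.vocab.sym2idx.
sym2idx = {'<eos>': 0, 'the': 1, 'cat': 2, 'sat': 3}

counter = nlp.data.count_tokens(sym2idx.keys())
vocab = nlp.vocab.Vocab(counter, token_to_idx=sym2idx,
                        unknown_token=None, padding_token=None,
                        bos_token=None, eos_token='<eos>')

# Indices in the Gluon vocab match the original mapping exactly.
assert all(vocab[token] == idx for token, idx in sym2idx.items())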
tokens = evaluation.get_similarity_task_tokens(args_)
vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
with utils.print_time('set {} embeddings'.format(len(tokens))):
    vocab.set_embedding(token_embedding_)
evaluation.evaluate_similarity(
    args_, vocab.embedding, ctx, logfile=os.path.join(
        args_.logdir, 'similarity{}.tsv'.format(name)))

if args_.analogy_datasets:
    with utils.print_time('extend open vocabulary with '
                          'OOV tokens for analogy'):
        tokens = evaluation.get_analogy_task_tokens(args_)
        if token_embedding_.unknown_token is not None:
            tokens.update(token_embedding_.idx_to_token[1:])
        else:
            tokens.update(token_embedding_.idx_to_token)
    vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
    with utils.print_time('set {} embeddings'.format(len(tokens))):
        vocab.set_embedding(token_embedding_)
    evaluation.evaluate_analogy(
        args_, vocab.embedding, ctx, logfile=os.path.join(
            args_.logdir, 'analogy{}.tsv'.format(name)))
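The evaluation code above repeatedly builds a task-specific Vocab and attaches a pretrained TokenEmbedding; a minimal standalone version of that pattern (the embedding source and word list are placeholders, and the call downloads the embedding file on first use):

import gluonnlp as nlp

tokens = {'king', 'queen', 'man', 'woman'}           # placeholder task vocabulary

token_embedding = nlp.embedding.create('glove', source='glove.6B.50d')
vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
vocab.set_embedding(token_embedding)

print(vocab.embedding['king'].shape)                 # (50,)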
def build_vocab(dataset):
    """
    Build vocab given a dataset.
    """
    counter = nlp.data.count_tokens([w for e in dataset for s in e[:2] for w in s],
                                    to_lower=True)
    vocab = nlp.Vocab(counter)
    return vocab
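Each dataset element here is expected to begin with two token sequences (e.g. premise and hypothesis); a hypothetical call, assuming the build_vocab above and `import gluonnlp as nlp` are in scope:

# Toy dataset: (premise_tokens, hypothesis_tokens, label) triples.
toy_dataset = [
    (['A', 'man', 'rides', 'a', 'horse'], ['A', 'person', 'is', 'outdoors'], 'entailment'),
    (['A', 'man', 'rides', 'a', 'horse'], ['The', 'man', 'is', 'sleeping'], 'contradiction'),
]

vocab = build_vocab(toy_dataset)
print(len(vocab), vocab['man'])  # counting is lower-cased, so 'A' and 'a' share one entry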
Returns
-------
gluonnlp.data.DataStream
    Each sample is a valid input to
    gluonnlp.data.EmbeddingCenterContextBatchify.
gluonnlp.Vocab
    Vocabulary of all tokens in Text8 that occur at least min_freq times,
    truncated to at most max_vocab_size entries.
idx_to_counts : list of int
    Mapping from token indices to their occurrence counts in the Text8
    dataset.

"""
with print_time('count and construct vocabulary'):
    counter = nlp.data.count_tokens(itertools.chain.from_iterable(data))
    vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                      bos_token=None, eos_token=None, min_freq=min_freq,
                      max_size=max_vocab_size)
    idx_to_counts = [counter[w] for w in vocab.idx_to_token]

def code(sentence):
    return [vocab[token] for token in sentence if token in vocab]

with print_time('code data'):
    data = data.transform(code, lazy=False)

data = nlp.data.SimpleDataStream([data])
return data, vocab, idx_to_counts
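Downstream, idx_to_counts is typically used for word2vec-style frequency subsampling of the coded data; a sketch of that heuristic using numpy only (the 1e-4 threshold is a common default, not taken from the snippet, and idx_to_counts is assumed to be the list returned above):

import numpy as np

counts = np.asarray(idx_to_counts, dtype=np.float64)
frequencies = counts / counts.sum()

# Keep probability per token index: sqrt(t / f), clipped to 1 (Mikolov et al. heuristic).
keep_prob = np.minimum(1.0, np.sqrt(1e-4 / frequencies))
print(keep_prob[:5])  # the most frequent tokens get keep probabilities well below 1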
def __call__(self, example):
    """Map an example into distinct tokens with their counts.

    Parameters
    ----------
    example : dict
        Example to process, with 'context_tokens' and 'ques_tokens' keys.

    Returns
    -------
    mapped_values : List[Tuple]
        Result of the mapping process. Each tuple is in (token, count) format.
    """
    para_counter = data.count_tokens(example['context_tokens'] if not self._iterate_over_example
                                     else [c for tkn in example['context_tokens'] for c in tkn])
    ques_counter = data.count_tokens(example['ques_tokens'] if not self._iterate_over_example
                                     else [c for tkn in example['ques_tokens'] for c in tkn])
    counter = para_counter + ques_counter
    return list(counter.items())
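A hypothetical call showing what this mapper produces for a toy record, assuming _iterate_over_example is False so whole tokens (rather than individual characters) are counted:

from gluonnlp import data

example = {
    'context_tokens': ['the', 'sky', 'is', 'blue'],
    'ques_tokens': ['what', 'color', 'is', 'the', 'sky'],
}

para_counter = data.count_tokens(example['context_tokens'])
ques_counter = data.count_tokens(example['ques_tokens'])
print(list((para_counter + ques_counter).items()))
# e.g. [('the', 2), ('sky', 2), ('is', 2), ('blue', 1), ('what', 1), ('color', 1)]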