# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def fit(self, trn_path: str, **kwargs) -> int:
    """Build the input vocabulary from a training corpus.

    Args:
        trn_path: Path to the training file.
        **kwargs: Unused; accepted for interface compatibility.

    Returns:
        Number of samples read from the corpus.
    """
    self.vocab = Vocab()
    sample_count = 0
    # Only the input side of each sample feeds the vocabulary.
    for inputs, _ in self.file_to_inputs(trn_path):
        self.vocab.update(inputs)
        sample_count += 1
    return sample_count
def fit(self, trn_path: str, **kwargs) -> int:
    """Build the tag vocabulary from gold-annotated training data.

    Args:
        trn_path: Path to the training file.
        **kwargs: Unused; accepted for interface compatibility.

    Returns:
        Number of samples read from the corpus.
    """
    # No unknown token: tags form a closed set during training.
    self.tag_vocab = Vocab(unk_token=None)
    sample_count = 0
    for _, gold_tags in self.file_to_inputs(trn_path, gold=True):
        sample_count += 1
        self.tag_vocab.update(gold_tags)
    return sample_count
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[Vocab, Vocab, Vocab]:
    """Build word, character and tag vocabularies from a two-column TSV file.

    Each non-blank line is expected to hold exactly ``word<whitespace>tag``.
    Blank lines (sentence separators) are skipped.

    Args:
        tsv_file_path: Path to the TSV corpus.
        lower: Lowercase words before adding them to the word vocab
            (characters are still taken from the original casing).
        lock_word_vocab: Lock the word vocab after building.
        lock_char_vocab: Lock the character vocab after building.
        lock_tag_vocab: Lock the tag vocab after building.

    Returns:
        A tuple of ``(word_vocab, char_vocab, tag_vocab)``.
    """
    word_vocab = Vocab()
    char_vocab = Vocab()
    tag_vocab = Vocab(unk_token=None)  # tags form a closed set; no unknown token
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            if cells:
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
    # Bug fix: lock_tag_vocab was accepted but never honored, and the function
    # fell off the end returning None despite its Tuple annotation.
    if lock_tag_vocab:
        tag_vocab.lock()
    return word_vocab, char_vocab, tag_vocab
def fit(self, trn_path: str, **kwargs) -> int:
    """Build word/tag (and optionally character) vocabularies from training data.

    Args:
        trn_path: Path to the training file.
        **kwargs: Unused; accepted for interface compatibility.

    Returns:
        Number of samples read from the corpus.
    """
    self.word_vocab = Vocab()
    # Tags need neither padding nor an unknown sentinel.
    self.tag_vocab = Vocab(pad_token=None, unk_token=None)
    sample_count = 0
    for words, tags in self.file_to_inputs(trn_path, True):
        sample_count += 1
        self.word_vocab.update(words)
        self.tag_vocab.update(tags)
    if self.char_vocab:
        # Rebuild the character vocab from every real word, skipping the
        # pad/unk sentinel tokens of the word vocab.
        self.char_vocab = Vocab()
        sentinels = (self.word_vocab.pad_token, self.word_vocab.unk_token)
        for token in self.word_vocab.token_to_idx.keys():
            if token not in sentinels:
                self.char_vocab.update(list(token))
    return sample_count
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[Vocab, Vocab, Vocab]:
    """Build word, character and tag vocabularies from a two-column TSV file.

    Each non-blank line is expected to hold exactly ``word<whitespace>tag``.
    Blank lines (sentence separators) are skipped.

    Args:
        tsv_file_path: Path to the TSV corpus.
        lower: Lowercase words before adding them to the word vocab
            (characters are still taken from the original casing).
        lock_word_vocab: Lock the word vocab after building.
        lock_char_vocab: Lock the character vocab after building.
        lock_tag_vocab: Lock the tag vocab after building.

    Returns:
        A tuple of ``(word_vocab, char_vocab, tag_vocab)``.
    """
    word_vocab = Vocab()
    char_vocab = Vocab()
    tag_vocab = Vocab(unk_token=None)  # tags form a closed set; no unknown token
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            if cells:
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
    # Bug fix: the `if lock_tag_vocab:` branch was left empty (a syntax error)
    # and the function never returned the tuple promised by its annotation.
    if lock_tag_vocab:
        tag_vocab.lock()
    return word_vocab, char_vocab, tag_vocab
def create_label_vocab() -> Vocab:
    """Create a label vocabulary with no padding or unknown sentinels."""
    label_vocab = Vocab(pad_token=None, unk_token=None)
    return label_vocab
def vocab_from_txt(txt_file_path, bigram_only=False, window_size=4, **kwargs) -> Tuple[Vocab, Vocab, Vocab]:
    """Build character, n-gram and tag vocabularies from a text corpus.

    Args:
        txt_file_path: Path to the text corpus.
        bigram_only: Restrict n-gram features to bigrams.
        window_size: Context window size for n-gram extraction.
        **kwargs: Unused; accepted for interface compatibility.

    Returns:
        A tuple of ``(char_vocab, ngram_vocab, tag_vocab)``.
    """
    char_vocab = Vocab()
    ngram_vocab = Vocab()
    tag_vocab = Vocab(pad_token=None, unk_token=None)  # closed label set
    for features, labels in generate_ngram_bmes(txt_file_path, bigram_only, window_size, gold=True):
        # features[0] holds raw characters; the rest are n-gram feature rows.
        char_vocab.update(features[0])
        for gram_row in features[1:]:
            # Drop falsy entries (empty/None placeholders) before updating.
            ngram_vocab.update(g for g in gram_row if g)
        tag_vocab.update(labels)
    return char_vocab, ngram_vocab, tag_vocab
def load_vocabs(self, save_dir, filename='vocabs.json'):
    """Restore vocabularies from a JSON file and attach them to the transform.

    Args:
        save_dir: Directory (or resource identifier resolved by
            ``get_resource``) containing the serialized vocabularies.
        filename: Name of the JSON file holding the vocab dump.
    """
    save_dir = get_resource(save_dir)
    serialized = SerializableDict()
    serialized.load_json(os.path.join(save_dir, filename))
    # Each top-level key maps to one vocab; rebuild and attach by name.
    for attr_name, payload in serialized.items():
        restored = Vocab()
        restored.copy_from(payload)
        setattr(self.transform, attr_name, restored)