import prenlp
import fasttext
from prenlp.data import Normalizer
from prenlp.tokenizer import NLTKMosesTokenizer

# Data load
imdb_train, imdb_test = prenlp.data.IMDB()

# Preprocessing
normalizer = Normalizer()  # prenlp text normalizer (default replacement settings assumed)
tokenizer = NLTKMosesTokenizer()
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))  # normalization + tokenization
        # dataset[i][0] = text.strip()                                           # original text
        # dataset[i][0] = normalizer.normalize(text.strip())                     # normalization only
        # dataset[i][0] = ' '.join(tokenizer(text.strip()))                      # tokenization only
prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')
# Train
model = fasttext.train_supervised(input='imdb.train', epoch=25)
# Evaluate
print(model.test('imdb.train'))
print(model.test('imdb.test'))
# Inference
print(imdb_test[0][0])
print(model.predict(imdb_test[0][0]))
# Build a plain-text corpus for SentencePiece training from the WikiTextKo dataset
# (writer is assumed to be a file handle opened for writing on corpus_path)
for text in wikitexko:
    writer.write(normalizer.normalize(text.strip()) + '\n')

# Preprocessing: train a SentencePiece tokenizer on the corpus, then apply it to NSMC
# (VOCAB_SIZE is the target SentencePiece vocabulary size, defined elsewhere)
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=VOCAB_SIZE)
tokenizer.load('sentencepiece.model')

nsmc_train, nsmc_test = prenlp.data.NSMC()  # data load (as in the Mecab example below)
for dataset in [nsmc_train, nsmc_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))
prenlp.data.fasttext_transform(nsmc_train, 'nsmc.train')
prenlp.data.fasttext_transform(nsmc_test, 'nsmc.test')
# Train
model = fasttext.train_supervised(input='nsmc.train', epoch=25)
# Evaluate
print(model.test('nsmc.train'))
print(model.test('nsmc.test'))
# Inference
print(nsmc_test[0][0])
print(model.predict(nsmc_test[0][0]))
nsmc_train, nsmc_test = prenlp.data.NSMC()
# Preprocessing (Mecab: Korean morphological-analyzer tokenizer from prenlp.tokenizer)
tokenizer = Mecab()
for dataset in [nsmc_train, nsmc_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))  # normalization + tokenization
        # dataset[i][0] = text.strip()                                           # original text
        # dataset[i][0] = normalizer.normalize(text.strip())                     # normalization only
        # dataset[i][0] = ' '.join(tokenizer(text.strip()))                      # tokenization only
prenlp.data.fasttext_transform(nsmc_train, 'nsmc.train')
prenlp.data.fasttext_transform(nsmc_test, 'nsmc.test')
# Train
model = fasttext.train_supervised(input='nsmc.train', epoch=25)
# Evaluate
print(model.test('nsmc.train'))
print(model.test('nsmc.test'))
# Inference
print(nsmc_test[0][0])
print(model.predict(nsmc_test[0][0]))
# Build a plain-text corpus for SentencePiece training from the raw dataset
# (writer is assumed to be a file handle opened for writing on corpus_path)
for text in dataset:
    writer.write(normalizer.normalize(text.strip()) + '\n')

# Preprocessing: train a SentencePiece tokenizer on the corpus, then apply it to IMDB
# (VOCAB_SIZE is the target SentencePiece vocabulary size, defined elsewhere)
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=VOCAB_SIZE)
tokenizer.load('sentencepiece.model')

imdb_train, imdb_test = prenlp.data.IMDB()  # data load (as in the first example)
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))
prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')
# Train
model = fasttext.train_supervised(input='imdb.train', epoch=25)
# Evaluate
print(model.test('imdb.train'))
print(model.test('imdb.test'))
# Inference
print(imdb_test[0][0])
print(model.predict(imdb_test[0][0]))
import os
import fasttext

def trainer(filepath: str,
            train_file: str,
            model_path: str,
            hyper_params: dict) -> None:
    """Train a sentiment model using fastText:
    https://fasttext.cc/docs/en/supervised-tutorial.html
    """
    train = os.path.join(filepath, train_file)
    model = fasttext.train_supervised(input=train, **hyper_params)
    print("FastText model trained with hyperparameters: \n {}".format(hyper_params))
    # Save the model to the model directory for fastText
    make_dirs(model_path)  # make_dirs: project helper that creates the directory if it does not exist
    model.save_model(os.path.join(model_path, "sst5_hyperopt.bin"))
    # Quantize the model to reduce disk usage
    model.quantize(input=train, qnorm=True, retrain=True, cutoff=110539)
    model.save_model(os.path.join(model_path, "sst5_hyperopt.ftz"))
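For context, here is a minimal sketch of how this trainer might be invoked. The directory layout, training file name, and hyperparameter values are illustrative assumptions, not taken from the original project; the keyword names (lr, epoch, wordNgrams, dim) are standard fasttext.train_supervised parameters.

# Hypothetical invocation; paths and hyperparameter values are illustrative only.
hyper_params = {
    'lr': 0.1,          # learning rate
    'epoch': 25,        # number of training epochs
    'wordNgrams': 2,    # include bigram features
    'dim': 100,         # embedding dimension
}
trainer(filepath='data', train_file='sst5.train', model_path='models/fasttext', hyper_params=hyper_params)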