How to use the fasttext.train_supervised function in fasttext

To help you get started, we've selected a few fasttext.train_supervised examples based on popular ways the function is used in public projects.
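As a quick orientation before the project examples below: fasttext.train_supervised trains a text classifier from a plain-text file in which each line is one example whose label is prefixed with __label__ by default, and it returns a model object with test and predict methods. A minimal sketch, assuming a file named train.txt in that format:

import fasttext

# Each line of train.txt looks like: __label__positive this movie was great
model = fasttext.train_supervised(input='train.txt', epoch=25, lr=0.5, wordNgrams=2)

# test() returns (number of examples, precision@1, recall@1)
print(model.test('train.txt'))

# predict() returns the predicted label(s) and their probabilities
print(model.predict('what a wonderful film'))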


From lyeoni/prenlp on GitHub: examples/fasttext_imdb.py
# Imports and setup from the full example script (reconstructed; module paths assumed):
import fasttext
import prenlp
from prenlp.data import Normalizer
from prenlp.tokenizer import NLTKMosesTokenizer

normalizer = Normalizer()

# Load the IMDB sentiment dataset as (text, label) pairs
imdb_train, imdb_test = prenlp.data.IMDB()

# Preprocessing
tokenizer = NLTKMosesTokenizer()
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip()))) # both
        # dataset[i][0] = text.strip() # original
        # dataset[i][0] = normalizer.normalize(text.strip()) # only normalization
        # dataset[i][0] = ' '.join(tokenizer(text.strip())) # only tokenization

prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')
         
# Train
model = fasttext.train_supervised(input='imdb.train', epoch=25)

# Evaluate
print(model.test('imdb.train'))
print(model.test('imdb.test'))

# Inference
print(imdb_test[0][0])
print(model.predict(imdb_test[0][0]))
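
In the excerpt above, model.test returns a tuple of (number of examples, precision@1, recall@1), and model.predict returns the predicted label(s) with their probabilities. A small follow-up sketch, assuming the model trained above, showing that predict also accepts k to return the top-k labels:

# Top-2 predicted labels and their probabilities for the first test review
labels, probs = model.predict(imdb_test[0][0], k=2)
print(labels, probs)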
From lyeoni/prenlp on GitHub: examples/fasttext_nsmc_sentencepiece.py
# Excerpt: `writer` is an open file handle for the raw-corpus file at `corpus_path`;
# `wikitexko`, `nsmc_train`, `nsmc_test`, `normalizer`, and VOCAB_SIZE are created
# earlier in the full script (the NSMC dataset comes from prenlp.data.NSMC()).
for text in wikitexko:
    writer.write(normalizer.normalize(text.strip()) + '\n')

# Preprocessing
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=VOCAB_SIZE)
tokenizer.load('sentencepiece.model')
for dataset in [nsmc_train, nsmc_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))

prenlp.data.fasttext_transform(nsmc_train, 'nsmc.train')
prenlp.data.fasttext_transform(nsmc_test, 'nsmc.test')
         
# Train
model = fasttext.train_supervised(input='nsmc.train', epoch=25)

# Evaluate
print(model.test('nsmc.train'))
print(model.test('nsmc.test'))

# Inference
print(nsmc_test[0][0])
print(model.predict(nsmc_test[0][0]))
From lyeoni/prenlp on GitHub: examples/fasttext_nsmc.py
# Excerpt: the full script imports fasttext and prenlp, builds a prenlp Normalizer
# named `normalizer`, and uses Mecab as the Korean tokenizer.
nsmc_train, nsmc_test = prenlp.data.NSMC()

# Preprocessing
tokenizer = Mecab()
for dataset in [nsmc_train, nsmc_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip()))) # both
        # dataset[i][0] = text.strip() # original
        # dataset[i][0] = normalizer.normalize(text.strip()) # only normalization
        # dataset[i][0] = ' '.join(tokenizer(text.strip())) # only tokenization

prenlp.data.fasttext_transform(nsmc_train, 'nsmc.train')
prenlp.data.fasttext_transform(nsmc_test, 'nsmc.test')
         
# Train
model = fasttext.train_supervised(input='nsmc.train', epoch=25)

# Evaluate
print(model.test('nsmc.train'))
print(model.test('nsmc.test'))

# Inference
print(nsmc_test[0][0])
print(model.predict(nsmc_test[0][0]))
From lyeoni/prenlp on GitHub: examples/fasttext_imdb_sentencepiece.py
# Excerpt: `writer` is an open file handle for the raw-corpus file at `corpus_path`;
# `dataset` is bound by an outer loop over the IMDB splits that the excerpt omits,
# and `imdb_train`, `imdb_test`, `normalizer`, and VOCAB_SIZE are created earlier
# in the full script.
for text in dataset:
    writer.write(normalizer.normalize(text.strip()) + '\n')

# Preprocessing
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=VOCAB_SIZE)
tokenizer.load('sentencepiece.model')
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))

prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')
         
# Train
model = fasttext.train_supervised(input='imdb.train', epoch=25)

# Evaluate
print(model.test('imdb.train'))
print(model.test('imdb.test'))

# Inference
print(imdb_test[0][0])
print(model.predict(imdb_test[0][0]))
From prrao87/fine-grained-sentiment on GitHub: training/train_fasttext.py
# Excerpt: the full script imports os and fasttext; `make_dirs` is a small project
# helper that creates the output directory if it does not already exist.
def trainer(filepath: str,
            train_file: str,
            model_path: str,
            hyper_params: dict) -> None:
    """Train sentiment model using FastText:
    https://fasttext.cc/docs/en/supervised-tutorial.html
    """
    train = os.path.join(filepath, train_file)
    model = fasttext.train_supervised(input=train, **hyper_params)
    print("FastText model trained with hyperparameters: \n {}".format(hyper_params))
    # Save models to model directory for fasttext
    make_dirs(model_path)
    model.save_model(os.path.join(model_path, "sst5_hyperopt.bin"))
    # Quantize model to reduce space usage
    model.quantize(input=train, qnorm=True, retrain=True, cutoff=110539)
    model.save_model(os.path.join(model_path, "sst5_hyperopt.ftz"))
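
A hypothetical call to this trainer; the file and directory paths below are illustrative, and the hyper_params keys are passed straight through to fasttext.train_supervised as keyword arguments such as lr, epoch, wordNgrams, and dim:

hyper_params = {
    'lr': 0.35,        # learning rate (illustrative value)
    'epoch': 25,       # passes over the training data
    'wordNgrams': 2,   # include word bigrams as features
    'dim': 100,        # embedding dimension
}
trainer(filepath='data', train_file='sst_train.txt',
        model_path='models/fasttext', hyper_params=hyper_params)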