Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Builds a MultiCorpus from the "fashion" column corpus and GERMEVAL, then
# starts training a SequenceTagger (use_crf=False) for the "ner" tag type.
# NOTE(review): this excerpt is cut off inside the trainer.train(...) call at
# the bottom; the remaining arguments and the rest of the test are not visible.
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
# first corpus: column data under tasks_base_path / "fashion"
# (column 0 is read as "text", column 2 as "ner")
corpus_1 = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
)
# second corpus: GERMEVAL, loaded from the same base path
corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
# combine both corpora and derive the "ner" tag dictionary from the combination
corpus = MultiCorpus([corpus_1, corpus_2])
tag_dictionary = corpus.make_tag_dictionary("ner")
embeddings = WordEmbeddings("turian")
tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
embeddings=embeddings,
tag_dictionary=tag_dictionary,
tag_type="ner",
use_crf=False,
)
# initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
# training output goes to results_base_path
trainer.train(
results_base_path,
learning_rate=0.1,
mini_batch_size=2,
def test_text_classifier_single_label(tasks_base_path):
    """Train an IMDB text classifier for two epochs and sanity-check its predicted labels."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove: WordEmbeddings = WordEmbeddings('en-glove')
    lstm_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove], 128, 1, False, 64, False, False
    )

    model = TextClassifier(lstm_embeddings, label_dict, False)
    TextClassifierTrainer(model, corpus, label_dict, False).train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    # every predicted label needs a value and a float score within [0, 1]
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # clean up results directory
def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    """Train an IMDB text classifier for two epochs with ImbalancedClassificationDatasetSampler."""
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    turian: WordEmbeddings = WordEmbeddings("turian")
    rnn_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [turian], 32, 1, False, 64, False, False
    )
    model: TextClassifier = TextClassifier(rnn_embeddings, label_dict, False)

    # shuffling is switched off; the sampler class is handed to the trainer
    ModelTrainer(model, corpus).train(
        results_base_path,
        max_epochs=2,
        shuffle=False,
        sampler=ImbalancedClassificationDatasetSampler,
    )

    sentence = Sentence("Berlin is a really nice city.")
def test_text_classifier_mulit_label(tasks_base_path):
    """Train an IMDB classifier on mean-pooled GloVe embeddings and sanity-check its labels."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove: WordEmbeddings = WordEmbeddings('en-glove')
    mean_embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings([glove])

    # third positional argument is True here (False in the single-label test)
    model = TextClassifier(mean_embeddings, label_dict, True)
    TextClassifierTrainer(model, corpus, label_dict, False).train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    # every predicted label needs a value and a float score within [0, 1]
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # clean up results directory
# Script fragment: loads a column corpus, builds its 'ner' tag dictionary,
# stacks GloVe with forward/backward Flair embeddings and constructs a
# WeightedSequenceTagger from them.
# NOTE(review): indentation was lost in this excerpt; presumably only the
# `model_folder += '_w'` line belongs to the `if` below — confirm.
if args.include_weight:
model_folder += '_w'
# print(column_format)
corpus: Corpus = NLPTaskDataFetcher.load_column_corpus(data_folder,
column_format=column_format,
tag_to_biloes="ner")
# the same tag type is used for the dictionary and for the tagger
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
embedding_types: List[TokenEmbeddings] = [
# GloVe embeddings
WordEmbeddings('glove'),
# contextual string embeddings, forward
FlairEmbeddings('news-forward'),
# PooledFlairEmbeddings('news-forward', pooling='min'),
# contextual string embeddings, backward
FlairEmbeddings('news-backward'),
# PooledFlairEmbeddings('news-backward', pooling='min'),
]
# combine the three embeddings into one stacked embedding
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
tagger: SequenceTagger = WeightedSequenceTagger(hidden_size=256,
embeddings=embeddings,
tag_dictionary=tag_dictionary,
tag_type=tag_type)
def load_context_embeddings_with_flair(direction='bi', word_embeddings=True,
cache_dir=DEFAULT_CACHE_DIR,
verbose=False):
"""
Collect Flair embeddings, optionally preceded by 'da' word embeddings.

:param direction: 'bi' loads both the 'flair.fwd' and 'flair.bwd' models,
'fwd' only the forward one, 'bwd' only the backward one
:param word_embeddings: when true, ``WordEmbeddings('da')`` is added first
:param cache_dir: passed to ``download_model`` when fetching the Flair weights
:param verbose: passed to ``download_model``
:return: the single embedding when exactly one was collected
"""
from flair.embeddings import FlairEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import StackedEmbeddings
embeddings = []
if word_embeddings:
fasttext_embedding = WordEmbeddings('da')
embeddings.append(fasttext_embedding)
# forward model: needed for 'bi' and 'fwd'
if direction == 'bi' or direction == 'fwd':
fwd_weight_path = download_model('flair.fwd', cache_dir,
verbose=verbose,
process_func=_unzip_process_func)
embeddings.append(FlairEmbeddings(fwd_weight_path))
# backward model: needed for 'bi' and 'bwd'
if direction == 'bi' or direction == 'bwd':
bwd_weight_path = download_model('flair.bwd', cache_dir,
verbose=verbose,
process_func=_unzip_process_func)
embeddings.append(FlairEmbeddings(bwd_weight_path))
# a lone embedding is returned directly instead of being wrapped
if len(embeddings) == 1:
return embeddings[0]
# NOTE(review): the excerpt ends here; presumably the remaining case wraps the
# list in the StackedEmbeddings imported above — confirm against the full file.
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
# 1. load the AG_NEWS corpus and downsample it with factor 0.1
full_corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, 'agnews/')
corpus: TaggedCorpus = full_corpus.downsample(0.1)

# 2. build the label dictionary from the corpus
label_dict = corpus.make_label_dictionary()

# 3. word-level embeddings: GloVe plus forward and backward Flair embeddings
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

# 4. document embedding built on top of the word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# 5. text classifier over the document embedding
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
    multi_label=False,
    attention=True,
)
"""Train sentiment model using Flair NLP library:
https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md
To help provide added context, we can stack GloVe, BERT or ELMo embeddings along with Flair embeddings.
"""
# pip install flair allennlp
from flair.datasets import ClassificationCorpus
from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
from flair.visual.training_curves import Plotter
# Choose the extra embedding named by `stack`; each branch imports only the
# embedding class it needs.
if stack == "glove":
from flair.embeddings import WordEmbeddings
stacked_embedding = WordEmbeddings('glove')
elif stack == "elmo":
from flair.embeddings import ELMoEmbeddings
stacked_embedding = ELMoEmbeddings('original')
elif stack == "bert":
from flair.embeddings import BertEmbeddings
stacked_embedding = BertEmbeddings('bert-base-cased')
else:
# any other value of `stack`: no extra embedding
stacked_embedding = None
# Define and Load corpus from the provided dataset
# `filenames` is unpacked as (train, dev, test), in that order
train, dev, test = filenames
corpus = ClassificationCorpus(
file_path,
train_file=train,
dev_file=dev,
test_file=test,
# NOTE(review): the three keywords below repeat train_file/test_file/dev_file
# already passed above, which Python rejects as a repeated keyword argument.
# This looks like two separate snippets fused together — confirm which set
# of file names is intended.
train_file="train.txt",
test_file="test.txt",
dev_file="dev.txt")
print(corpus)

# 2. the tag type the tagger is trained to predict
tag_type = 'ner'

# 3. build the tag dictionary from the corpus and show its entries
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. character embeddings stacked with the word embeddings stored on disk
character_features = CharacterEmbeddings()
word_vectors = WordEmbeddings("tmp/glove.1.8G.bin")
embedding_types: List[TokenEmbeddings] = [character_features, word_vectors]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. sequence tagger with a CRF on top of the stacked embeddings
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(
    hidden_size=1024,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# 6. initialize trainer
from flair.trainers import SequenceTaggerTrainer
def get_embeddings(embeddings: List[str], character: bool, lang: str, bpe_size: int) -> StackedEmbeddings:
    """Build a stacked embedding model from embedding names plus optional extras.

    Names containing 'forward' or 'backward' become FlairEmbeddings; every
    other non-empty name becomes WordEmbeddings. Empty names are skipped.
    CharacterEmbeddings are appended when ``character`` is true, and
    BytePairEmbeddings for ``lang`` with dimension ``bpe_size`` are appended
    when ``bpe_size`` is positive.
    """
    layers = []
    for name in embeddings:
        if name == '':
            continue
        is_flair = 'forward' in name or 'backward' in name
        layers.append(FlairEmbeddings(name) if is_flair else WordEmbeddings(name))

    if character:
        layers.append(CharacterEmbeddings())
    if bpe_size > 0:
        layers.append(BytePairEmbeddings(language=lang, dim=bpe_size))

    return StackedEmbeddings(embeddings=layers)