def init(tasks_base_path) -> Tuple[TaggedCorpus, TextRegressor, ModelTrainer]:
    # fetch the regression corpus and build a GloVe-based document embedder
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.REGRESSION, tasks_base_path)
    glove_embedding: WordEmbeddings = WordEmbeddings("glove")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False
    )
    model = TextRegressor(document_embeddings)
    trainer = ModelTrainer(model, corpus)
    return corpus, model, trainer
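# A short usage sketch for the helper above (the tasks path and training
# parameters are illustrative placeholders, not taken from the original tests):
corpus, model, trainer = init("resources/tasks")
trainer.train("regression_train", mini_batch_size=2, max_epochs=2)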
def test_training(tasks_base_path):
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')
    embeddings = WordEmbeddings('glove')
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)
    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=10)
    # clean up results directory (assumes `import shutil` at module level)
    shutil.rmtree('./results')
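# The test above uses the older SequenceTaggerTrainer API. With the ModelTrainer
# seen in the first snippet, the equivalent call would look roughly like this
# (a sketch, assuming a flair version that ships ModelTrainer):
trainer = ModelTrainer(tagger, corpus)
trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=10)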
def test_text_classifier_single_label(tasks_base_path):
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)
    label_dict = corpus.make_label_dictionary()
    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False
    )
    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)
    sentence = Sentence("Berlin is a really nice city.")
    # every predicted label must carry a value and a probability in [0, 1]
    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, MemoryEmbeddings, CharacterEmbeddings
from typing import List
import torch

# 1. get the corpus
columns = {0: 'text', 1: 'ner'}
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_column_corpus("data1", columns,
                                                              train_file="train.txt",
                                                              test_file="test.txt",
                                                              dev_file="dev.txt")
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
]
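# A hedged sketch of how such a tutorial script typically continues from step 4
# (output path and hyperparameters are illustrative; imports assume a flair
# 0.4-era API):

# 5. combine the token embeddings into a single stacked embedding
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 6. initialize the sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 7. start training
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)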
# (tail of the preceding branch of NLPTaskDataFetcher.load_corpus:
#  universal-dependencies-style tasks fall through to the UD loader)
    return NLPTaskDataFetcher.load_ud_corpus(data_folder)
# for text classifiers, we use our own special format
if task in [
    NLPTask.IMDB.value,
    NLPTask.AG_NEWS.value,
    NLPTask.TREC_6.value,
    NLPTask.TREC_50.value,
    NLPTask.REGRESSION.value,
]:
    # TREC comes pre-tokenized, so skip the tokenizer for those tasks
    use_tokenizer: bool = task not in [NLPTask.TREC_6.value, NLPTask.TREC_50.value]
    return NLPTaskDataFetcher.load_classification_corpus(
        data_folder, use_tokenizer=use_tokenizer
    )

# NER corpus for Basque
if task == NLPTask.NER_BASQUE.value:
    columns = {0: "text", 1: "ner"}
    return NLPTaskDataFetcher.load_column_corpus(
        data_folder, columns, tag_to_biloes="ner"
    )

if task.startswith("wassa"):
    return NLPTaskDataFetcher.load_classification_corpus(
        data_folder, use_tokenizer=True
    )

if (
    task == NLPTask.CONLL_03.value
    or task == NLPTask.ONTONER.value
    or task == NLPTask.FASHION.value
):
    columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
    return NLPTaskDataFetcher.load_column_corpus(
        data_folder, columns, tag_to_biloes="ner"
    )

# the CoNLL 03 task for German has an additional lemma column
if task == NLPTask.CONLL_03_GERMAN.value:
    columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"}
    return NLPTaskDataFetcher.load_column_corpus(
        data_folder, columns, tag_to_biloes="ner"
    )

# the CoNLL 03 task for Dutch has no NP column
if task == NLPTask.CONLL_03_DUTCH.value or task.startswith("wikiner"):
    columns = {0: "text", 1: "pos", 2: "ner"}
    return NLPTaskDataFetcher.load_column_corpus(
        data_folder, columns, tag_to_biloes="ner"
    )

# the CoNLL 03 task for Spanish only has two columns
if task == NLPTask.CONLL_03_SPANISH.value or task == NLPTask.WNUT_17.value:
    columns = {0: "text", 1: "ner"}
    return NLPTaskDataFetcher.load_column_corpus(
        data_folder, columns, tag_to_biloes="ner"
    )
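# All of the branches above key on the task's string value, so load_corpus can
# be called with either the enum member or a plain string; a minimal usage
# sketch (task choice and base_path are illustrative):
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.WNUT_17, base_path="resources/tasks")
corpus = NLPTaskDataFetcher.load_corpus("wnut_17", base_path="resources/tasks")
# note: the automatic download only happens when the enum is passed
# (see download_dataset in the load_corpus snippet further down)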
# get the train data
sentences_train: List[Sentence] = NLPTaskDataFetcher.read_column_data(
    train_file, column_format
)

# read in the test file if it exists, otherwise sample 10% of the train data as test dataset
if test_file is not None:
    sentences_test: List[Sentence] = NLPTaskDataFetcher.read_column_data(
        test_file, column_format
    )
else:
    sentences_test: List[Sentence] = [
        sentences_train[i]
        for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
    ]
    sentences_train = [x for x in sentences_train if x not in sentences_test]

# read in the dev file if it exists, otherwise sample 10% of the train data as dev dataset
if dev_file is not None:
    sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_column_data(
        dev_file, column_format
    )
else:
    sentences_dev: List[Sentence] = [
        sentences_train[i]
        for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
    ]
    sentences_train = [x for x in sentences_train if x not in sentences_dev]

if tag_to_biloes is not None:
    # convert the tag scheme to BILOES
    for sentence in sentences_train + sentences_test + sentences_dev:
        sentence.convert_tag_scheme(
            tag_type=tag_to_biloes, target_scheme="iobes"
        )
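# The private NLPTaskDataFetcher.__sample helper is not part of these fragments.
# A minimal sketch of what a sampler of this shape needs to do (name, seed and
# implementation are illustrative, not flair's actual code):
import random

def sample_indices(total: int, proportion: float = 0.1, seed: int = 42) -> List[int]:
    # draw a reproducible random subset of indices covering `proportion` of the data
    rng = random.Random(seed)
    return sorted(rng.sample(range(total), round(total * proportion)))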
log.info("Test: {}".format(test_file))
# get train and test data
sentences_train: List[Sentence] = NLPTaskDataFetcher.read_column_data(
train_file, column_format
)
# read in test file if exists, otherwise sample 10% of train data as test dataset
if test_file is not None:
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_column_data(
test_file, column_format
)
else:
sentences_test: List[Sentence] = [
sentences_train[i]
for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
]
sentences_train = [x for x in sentences_train if x not in sentences_test]
# read in dev file if exists, otherwise sample 10% of train data as dev dataset
if dev_file is not None:
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_column_data(
dev_file, column_format
)
else:
sentences_dev: List[Sentence] = [
sentences_train[i]
for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
]
sentences_train = [x for x in sentences_train if x not in sentences_dev]
if tag_to_biloes is not None:
def load_corpus(task: Union[NLPTask, str], base_path: Union[str, Path] = None) -> Corpus:
    """
    Helper function to fetch a Corpus for a specific NLPTask. For this to work, you need to first
    download the corresponding NLP task data and put it into the appropriate folder structure.
    The tutorials on https://github.com/zalandoresearch/flair give more info on how to do this.
    Alternatively, you can use this code to create your own data fetchers.
    :param task: specification of the NLPTask you wish to get
    :param base_path: path to the data folder containing the task sub-folders
    :return: a Corpus consisting of train, dev and test data
    """
    # first, try to fetch the dataset online
    if type(task) is NLPTask:
        NLPTaskDataFetcher.download_dataset(task)

    # the default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    if type(base_path) == str:
        base_path: Path = Path(base_path)

    # get the string value if an enum is passed
    task = task.value if type(task) is NLPTask else task
    data_folder = base_path / task.lower()

    # the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk)
    if task == NLPTask.CONLL_2000.value:
        columns = {0: "text", 1: "pos", 2: "np"}
        return NLPTaskDataFetcher.load_column_corpus(
            data_folder, columns, tag_to_biloes="np"
        )
    # (fragment of the file auto-detection loop: `file` and `file_name` come from
    #  a scan over the files in data_folder, and file-name conventions pick the splits)
    if "test" in file_name:
        test_file = file
    if "dev" in file_name:
        dev_file = file
    if "testa" in file_name:
        dev_file = file
    if "testb" in file_name:
        test_file = file

log.info("Reading data from {}".format(data_folder))
log.info("Train: {}".format(train_file))
log.info("Dev: {}".format(dev_file))
log.info("Test: {}".format(test_file))

sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(train_file)
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(test_file)
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(dev_file)

return Corpus(
    sentences_train, sentences_dev, sentences_test, name=data_folder.name
)
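# A usage sketch tying the fetcher together (the task choice is illustrative and
# assumes the UD data has been downloaded into the expected folder layout):
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)
print(corpus)  # prints the train/dev/test sentence counts
print(len(corpus.train), len(corpus.dev), len(corpus.test))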