import shutil

# import paths below follow the pre-0.4 Flair API used by this test
from flair.data_fetcher import NLPTask, NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import SequenceTaggerTrainer


def test_training(tasks_base_path):
    # fetch the corpus and build the NER tag dictionary
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    # classic GloVe word embeddings only, no contextual embeddings
    embeddings = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer (test_mode keeps the run short and deterministic)
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=10)

    # clean up results directory
    shutil.rmtree('./results')
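# note: SequenceTaggerTrainer is the pre-0.4 Flair API; on later releases the
# same test would use ModelTrainer instead (a sketch, mirroring the snippets below)
from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)
trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=10)
shutil.rmtree('./results')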
# 4. initialize embeddings (corpus, tag_type and tag_dictionary come from earlier steps)
# embedding_types: List[TokenEmbeddings] = [
#     WordEmbeddings('glove'),
#     # comment in this line to use character embeddings
#     # CharacterEmbeddings(),
#     # comment in these lines to use contextual string embeddings
#     # CharLMEmbeddings('news-forward'),
#     # CharLMEmbeddings('news-backward'),
# ]
# embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# use task-specific MemoryEmbeddings instead of the stacked embeddings above
embeddings = MemoryEmbeddings(tag_type=tag_type, tag_dictionary=tag_dictionary)
# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import SequenceTaggerTrainer

trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus)

# 7. start training
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=128,
              max_epochs=150)
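# 8. (optional) plot training curves from the files the trainer writes out;
# a sketch assuming the Plotter API and the default loss.tsv / weights.txt names
from flair.visual.training_curves import Plotter

plotter = Plotter()
plotter.plot_training_curves('resources/taggers/example-ner/loss.tsv')
plotter.plot_weights('resources/taggers/example-ner/weights.txt')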
from flair.data import Corpus
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, TokenEmbeddings, WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

corpus: Corpus = prepare_flair_train_dev_corpus(
    spacy_model=nlp, data_folder=data_folder, dev_size=dev_size, nb_segment=nb_segment, segment=segment
)
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
print(tag_dictionary.idx2item)

# French word embeddings stacked with forward/backward contextual Flair embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("fr"),
    FlairEmbeddings("fr-forward"),
    FlairEmbeddings("fr-backward"),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256, embeddings=embeddings, use_crf=True, tag_dictionary=tag_dictionary, tag_type="ner"
)
trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus, use_tensorboard=False)

# TODO optimize LR https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
trainer.train(
    model_folder,
    max_epochs=nb_epochs,
    learning_rate=0.1,
    mini_batch_size=32,
    embeddings_storage_mode="cpu",  # keep computed embeddings in CPU memory between epochs
    checkpoint=False,
)
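# the TODO above refers to Flair's learning-rate range test; a minimal sketch,
# assuming the trainer.find_learning_rate API from the linked tutorial
from flair.visual.training_curves import Plotter

learning_rate_tsv = trainer.find_learning_rate(model_folder, "learning_rate.tsv")
Plotter().plot_learning_rate(learning_rate_tsv)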
tag_type = "ner"
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("glove"),
    FlairEmbeddings("news-forward"),
    FlairEmbeddings("news-backward"),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
trainer.train(model_path, learning_rate=0.1, mini_batch_size=16, max_epochs=10)
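# after training, the saved model can be reloaded and used for prediction;
# a minimal sketch using the standard Flair API (model_path as above)
import os

from flair.data import Sentence

tagger = SequenceTagger.load(os.path.join(model_path, "best-model.pt"))
sentence = Sentence("George Washington went to Washington.")
tagger.predict(sentence)
print(sentence.to_tagged_string())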
import os
import random
from typing import List
import spacy
from flair.data import Sentence, build_spacy_tokenizer
from flair.models import SequenceTagger
from ner.model_factory import get_tokenizer
from resources.config_provider import get_config_default
from xml_extractions.extract_node_values import Paragraph, get_paragraph_from_file
random.seed(5)
tagger: SequenceTagger = SequenceTagger.load('resources/flair_ner/ca/best-model.pt')
config_training = get_config_default()
nlp = spacy.blank('fr')
nlp.tokenizer = get_tokenizer(nlp)
tokenizer = build_spacy_tokenizer(nlp)
xml_train_path = "../similar_legal_case/data/jurica_original_xml/arrets-juri" # config_training["xml_train_path"]
files = [os.path.join(path, name) for path, _, files in os.walk(xml_train_path) for name in files]
random.shuffle(files)
with open("./resources/training_data/generated_annotations.txt", mode='w') as generated_text:
with open("./resources/training_data/generated_annotations.ent", mode='w') as generated_entities:
for filename in files:
if filename.endswith(".xml"):
try:
print(f"--- {filename} ---")
tag_type = "ner"
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# assemble the embedding stack according to the component configuration
embedding_types: List[TokenEmbeddings] = []
if self.component_config["use_glove_embeddings"]:
    embedding_types.append(WordEmbeddings("glove"))
if self.component_config["use_flair_embeddings"]:
    embedding_types.append(FlairEmbeddings("news-forward"))
    embedding_types.append(FlairEmbeddings("news-backward"))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(
    hidden_size=self.component_config["hidden_size"],
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train(
    self.model_path,
    learning_rate=self.component_config["learning_rate"],
    mini_batch_size=self.component_config["mini_batch_size"],
    max_epochs=self.component_config["max_epochs"],
)
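# a minimal sketch of the configuration keys this component reads
# (the values shown are illustrative defaults, not from the original)
component_config = {
    "use_glove_embeddings": True,
    "use_flair_embeddings": True,
    "hidden_size": 256,
    "learning_rate": 0.1,
    "mini_batch_size": 32,
    "max_epochs": 20,
}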
from tqdm import tqdm
from flair.visual.ner_html import render_ner_html

sentences: List[Sentence] = list()
with tqdm(total=len(filenames), unit=" XML", desc="Parsing XML") as progress_bar:
    for filename in filenames:
        paragraphs: List[Paragraph] = get_paragraph_from_file(
            path=os.path.join(data_folder, filename), keep_paragraph_without_annotation=True
        )
        if len(paragraphs) > top_n:
            for paragraph in paragraphs[:top_n]:
                if len(paragraph.text) > 0:
                    s = Sentence(text=paragraph.text, tokenizer=tokenizer)
                    sentences.append(s)
        progress_bar.update()

if len(sentences) == 0:
    raise Exception("No example loaded; possible causes: no cases in the provided path, or the sample size is too high")

tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, "best-model.pt"))
_ = tagger.predict(sentences=sentences, mini_batch_size=32, verbose=True)

print("prepare html")
page_html = render_ner_html(sentences, colors=colors)

print("write html")
with open("sentence.html", "w") as writer:
    writer.write(page_html)
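# besides the HTML view, predicted entities can be read directly from each
# sentence; a minimal sketch using Flair's span API
for sentence in sentences[:5]:
    for entity in sentence.get_spans("ner"):
        print(entity)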