# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    """Tag every ``.txt`` case file with the trained NER model and render HTML.

    :param data_folder: directory containing the input ``.txt`` case files
    :param output_folder: directory where the ``.html`` renderings are written
    :param model_folder: directory holding the trained ``best-model.pt``
    """
    nlp: Language = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    txt_files = [f for f in os.listdir(data_folder) if f.endswith(".txt")]
    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, "best-model.pt"))
    for txt_file in tqdm(iterable=txt_files, unit=" txt", desc="anonymize cases"):
        with open(os.path.join(data_folder, txt_file), "r") as input_f:
            tagged_sentences = tagger.predict(
                sentences=input_f.readlines(),
                mini_batch_size=32,
                verbose=False,
                use_tokenizer=tokenizer,
            )
        case_name = txt_file.split(".")[0]
        page_html = render_ner_html(tagged_sentences, colors=colors, title=case_name)
        with open(os.path.join(output_folder, case_name + ".html"), "w") as output:
            output.write(page_html)
# NOTE(review): fragment — the matching `if` branch (crfsuite load) starts above this view.
self.features = d["features"]
self.model_type = "crfsuite"
else: # Assume flair model
# Filter stdout while importing flair/torch so their banner output stays quiet.
p = StdOutFilter()
p.start()
# Silently import flair/torch
import flair
from flair.data import Sentence
from flair.models import SequenceTagger
import torch
p.end()
# Load the serialized flair tagger from disk and record which backend is active.
self.tagger = SequenceTagger.load(model_path)
self.model_type = "flair"
def load_flair_pos_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Download (if not cached) and load the flair part-of-speech tagger.

    :param cache_dir: directory used to cache the downloaded model weights
    :param verbose: whether the download helper prints progress information
    :return: the loaded flair ``SequenceTagger``
    """
    from flair.models import SequenceTagger

    weights_path = download_model('flair.pos', cache_dir, process_func=_unzip_process_func, verbose=verbose)
    # using the flair model
    return SequenceTagger.load(weights_path)
def load_flair_ner_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Download (if not cached) and load the flair named-entity tagger.

    :param cache_dir: directory used to cache the downloaded model weights
    :param verbose: whether the download helper prints progress information
    :return: the loaded flair ``SequenceTagger``
    """
    from flair.models import SequenceTagger

    weights_path = download_model('flair.ner', cache_dir, process_func=_unzip_process_func, verbose=verbose)
    # using the flair model
    return SequenceTagger.load(weights_path)
# NOTE(review): fragment — the SequenceTagger(...) constructor call opens above this view.
embeddings=embeddings,
tag_dictionary=tag_dictionary,
tag_type=tag_type,
use_crf=True,
)
# Train the tagger on the prepared corpus with the component's configured hyper-parameters.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train(
self.model_path,
learning_rate=self.component_config["learning_rate"],
mini_batch_size=self.component_config["mini_batch_size"],
max_epochs=self.component_config["max_epochs"],
)
# Reload the final serialized model written by the trainer.
self.tagger = SequenceTagger.load(
os.path.join(self.model_path, "final-model.pt")
)
def load_flair_pos_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Fetch the cached flair POS model (downloading on first use) and load it.

    :param cache_dir: cache directory for the downloaded model archive
    :param verbose: pass-through verbosity flag for the downloader
    :return: the loaded flair ``SequenceTagger``
    """
    from flair.models import SequenceTagger

    model_path = download_model('flair.pos', cache_dir, process_func=_unzip_process_func, verbose=verbose)
    # using the flair model
    flair_tagger = SequenceTagger.load(model_path)
    return flair_tagger
# Evaluate a trained flair NER model against rasa-style test data.
# NOTE(review): truncated fragment — the predicted_entities.append( call on the
# final line continues beyond this view.
def evaluate_model(
model_path: Text, test_data: TrainingData, interpreter: Interpreter
) -> Dict:
from flair.data import Sentence
from flair.models import SequenceTagger
# Load the final serialized model produced by training.
tagger = SequenceTagger.load(os.path.join(model_path, "final-model.pt"))
# Drop pretrained extractors so only this model's predictions are scored.
interpreter.pipeline = remove_pretrained_extractors(interpreter.pipeline)
entity_results = []
for ex in test_data.training_examples:
result = interpreter.parse(ex.text, only_output_properties=False)
# Rebuild the sentence text from the interpreter's whitespace-stripped tokens.
text = " ".join([t.text.strip() for t in result.get("tokens")])
sentence = Sentence(text)
tagger.predict(sentence)
spans = sentence.get_spans("ner")
predicted_entities = []
for s in spans:
predicted_entities.append(
def main(
    data_folder: str, model_folder: str, dev_size: float, nb_segment: Optional[int], segment: Optional[int]
) -> None:
    """Evaluate a trained flair NER model on the dev split of the corpus.

    :param data_folder: directory holding the annotated training data
    :param model_folder: directory containing ``best-model.pt``
    :param dev_size: fraction of the data reserved for the dev split
    :param nb_segment: total number of segments when evaluating a data slice
    :param segment: index of the segment to evaluate, if any
    """
    nlp = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_dev_corpus(
        spacy_model=nlp, data_folder=data_folder, dev_size=dev_size, nb_segment=nb_segment, segment=segment
    )
    print(corpus)
    # flair.device = torch.device('cpu') # (4mn 28)
    tagger: SequenceTagger = SequenceTagger.load(model=os.path.join(model_folder, "best-model.pt"))
    test_results, _ = tagger.evaluate(sentences=corpus.dev, mini_batch_size=32)
    print(test_results.detailed_results)
    sentences_predict = copy.deepcopy(corpus.dev.sentences)
    # Strip any pre-existing tags so predictions are not contaminated by gold labels.
    for sentence in sentences_predict:
        for token in sentence:
            token.tags = {}
    _ = tagger.predict(sentences=sentences_predict, mini_batch_size=32, embedding_storage_mode="none", verbose=True)
def span_to_str(span: Span) -> str:
    """Format a tagged span as ``"text [TAG] (position)"`` using token indices.

    A single-token span renders its position as one index (``"3"``); a
    multi-token span renders it as a range (``"1-2"``).

    :param span: a flair ``Span``; only ``tokens[*].idx``, ``text`` and
        ``tag`` are read
    :return: human-readable description of the span and its token position(s)
    """
    start_token = span.tokens[0].idx
    # Negative indexing replaces the original `len(span.tokens) - 1` arithmetic.
    end_token = span.tokens[-1].idx
    token_position = f"{start_token}" if start_token == end_token else f"{start_token}-{end_token}"
    return f"{span.text} [{span.tag}] ({token_position})"
def predict(self, model_dir=os.path.normpath(r'.resources/taggers/slow_bert/final-model.pt'), input_string='I love Berlin'):
    """Load a trained tagger and print the tagged version of *input_string*.

    :param model_dir: path to the serialized flair model checkpoint.
        NOTE(review): the default ``.resources/...`` looks like a typo for
        ``./resources/...`` — confirm against the training output path.
    :param input_string: sentence to tag
    """
    from flair.models import SequenceTagger

    # load the model you trained
    tagger = SequenceTagger.load(model_dir)
    # wrap the raw text in a flair Sentence, predict tags, and print the result
    example = Sentence(input_string)
    tagger.predict(example)
    print(example.to_tagged_string())