# imports assumed for these snippets (flair ~0.4.x, where segtok_tokenizer lives in flair.data)
from flair.data import Sentence, segtok_tokenizer

text = ": nation on"
sentence = Sentence(text, use_tokenizer=segtok_tokenizer)
assert text == sentence.to_original_text()

text = "I love Berlin."
sentence = Sentence(text)
assert text == sentence.to_original_text()

text = 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .'
sentence = Sentence(text)
assert text == sentence.to_original_text()

# same text, this time run through the segtok tokenizer
sentence = Sentence(text, use_tokenizer=segtok_tokenizer)
assert text == sentence.to_original_text()
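# A quick contrast (same flair API as the asserts above): to_original_text() restores the
# untokenized input from token start positions, while to_tokenized_string() joins tokens
# with single spaces.
sentence = Sentence("I love Berlin.", use_tokenizer=segtok_tokenizer)
print(sentence.to_original_text())     # 'I love Berlin.'
print(sentence.to_tokenized_string())  # 'I love Berlin .'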
# imports assumed for this snippet (flair ~0.4.x; CharLMEmbeddings was later renamed FlairEmbeddings)
import shutil
from pathlib import Path

from flair.data import Dictionary, Sentence
from flair.embeddings import CharLMEmbeddings
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# character dictionary (assumed: flair's shipped character dictionary, as in its LM tests)
dictionary: Dictionary = Dictionary.load('chars')

# init forward LM with 128 hidden states and 1 layer
language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

# get the example corpus and process at character level in forward direction
corpus: TextCorpus = TextCorpus(str(Path(__file__).parent / 'resources/corpora/lorem_ipsum'),
                                dictionary,
                                language_model.is_forward_lm,
                                character_level=True)

# train the language model
trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus)
trainer.train('./results', sequence_length=10, mini_batch_size=10, max_epochs=5)

# use the character LM as embeddings to embed the example sentence 'I love Berlin'
char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
sentence = Sentence('I love Berlin')
char_lm_embeddings.embed(sentence)
print(sentence[1].embedding.size())

# clean up results directory
shutil.rmtree('./results', ignore_errors=True)
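# Follow-up sketch (assuming flair's LanguageModel.generate_text API, which samples
# characters from the model and returns the text together with its log-likelihood);
# the in-memory model can still generate after the checkpoint directory was removed.
generated_text, log_likelihood = language_model.generate_text(number_of_characters=100)
print(generated_text, log_likelihood)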
from flair.data import Sentence
from flair.embeddings import CamembertEmbeddings


def embed_sentence(
    sentence: str,
    pooling_operation,
    layers: str = "1",
    use_scalar_mix: bool = False,
) -> Sentence:
    embeddings = CamembertEmbeddings(
        pretrained_model_name_or_path=camembert_model,
        layers=layers,
        pooling_operation=pooling_operation,
        use_scalar_mix=use_scalar_mix,
    )
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)
    return flair_sentence
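# Usage sketch (hypothetical values: camembert_model is normally a test fixture;
# "camembert-base" and pooling_operation="first" are assumptions, with
# "first"/"last"/"first_last"/"mean" being the pooling modes these classes accept).
# The same call pattern applies to the GPT, RoBERTa and GPT-2 variants below.
camembert_model = "camembert-base"
embedded = embed_sentence("J'aime Paris .", pooling_operation="first")
print(embedded[0].embedding.shape)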
from flair.embeddings import OpenAIGPTEmbeddings


def embed_sentence(
    sentence: str,
    pooling_operation,
    layers: str = "1",
    use_scalar_mix: bool = False,
) -> Sentence:
    embeddings = OpenAIGPTEmbeddings(
        pretrained_model_name_or_path=gpt_model,
        layers=layers,
        pooling_operation=pooling_operation,
        use_scalar_mix=use_scalar_mix,
    )
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)
    return flair_sentence
from flair.embeddings import RoBERTaEmbeddings


def embed_sentence(
    sentence: str,
    pooling_operation,
    layers: str = "1",
    use_scalar_mix: bool = False,
) -> Sentence:
    embeddings = RoBERTaEmbeddings(
        pretrained_model_name_or_path=roberta_model,
        layers=layers,
        pooling_operation=pooling_operation,
        use_scalar_mix=use_scalar_mix,
    )
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)
    return flair_sentence
import flair.datasets
from flair.data import Sentence, segtok_tokenizer


def test_sentence_to_real_string(tasks_base_path):
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=segtok_tokenizer)
    assert "I love Berlin." == sentence.to_plain_string()

    corpus = flair.datasets.GERMEVAL(base_path=tasks_base_path)

    sentence = corpus.train[0]
    assert (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .'
        == sentence.to_tokenized_string()
    )
    assert (
        'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer Weise aufgetreten, die alles andere als überzeugend war".'
        == sentence.to_plain_string()
    )
from flair.embeddings import OpenAIGPT2Embeddings


def embed_sentence(
    sentence: str,
    pooling_operation,
    layers: str = "1",
    use_scalar_mix: bool = False,
) -> Sentence:
    embeddings = OpenAIGPT2Embeddings(
        pretrained_model_name_or_path=gpt_model,
        layers=layers,
        pooling_operation=pooling_operation,
        use_scalar_mix=use_scalar_mix,
    )
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)
    return flair_sentence
from typing import List

from flair.data import Sentence, Token


def get_sentences(text, lang, use_ontonotes, fast, use_embeddings, char_embeddings, bpe_size, expressions, pos, sentiment) -> List[Sentence]:
    """Process text with Flair models and return the tagged Flair sentences."""
    if lang not in ('en', 'multi', 'de', 'nl', 'fr'):
        raise TypeError(
            f'{lang} is not supported! Try multi. See https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_2_TAGGING.md')

    # tokenize sentences with the module's segment() helper
    sentences = []
    for s in segment(text):
        sentence = Sentence()
        sentences.append(sentence)
        for t in s:
            sentence.add_token(Token(t.value, start_position=t.offset, whitespace_after=t.space_after))

    # run models
    for model in get_models(lang=lang, use_ontonotes=use_ontonotes, fast=fast, expressions=expressions, pos=pos, sentiment=sentiment):
        model.predict(sentences)

    # load embedding models
    if use_embeddings or char_embeddings or bpe_size > 0:
        get_embeddings([e.strip() for e in use_embeddings.split(',')], char_embeddings, lang, bpe_size).embed(sentences)

    return sentences
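# Usage sketch (hypothetical flag values; segment(), get_models() and get_embeddings()
# are this module's own helpers):
tagged = get_sentences(
    "I love Berlin. I live in Vienna.",
    lang='en', use_ontonotes=False, fast=True,
    use_embeddings='', char_embeddings=False, bpe_size=0,
    expressions=False, pos=True, sentiment=False,
)
for s in tagged:
    print(s.to_tagged_string())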
def sent_to_flair(sent):
    """
    Convert a tokenized sentence (list of words) to a Flair sentence object
    """
    sentence = Sentence()
    for w in sent:
        token = Token(w)
        sentence.add_token(token)
    sentence.infer_space_after()
    return sentence
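# Usage sketch: rebuild a Flair Sentence from pre-tokenized input; infer_space_after()
# heuristically glues punctuation back onto the preceding token.
flair_sent = sent_to_flair(["I", "love", "Berlin", "."])
print(flair_sent.to_plain_string())  # 'I love Berlin.'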
def __getitem__(self, index: int = 0) -> Sentence:
    text = self.texts[index]
    return Sentence(text, use_tokenizer=self.use_tokenizer)
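# Context sketch (assumed: a minimal dataset wrapper in the spirit of flair's StringDataset,
# which is where a __getitem__ like the one above typically lives; the class name here
# is hypothetical):
from typing import List, Union

from torch.utils.data import Dataset

from flair.data import Sentence


class TextDataset(Dataset):
    def __init__(self, texts: Union[str, List[str]], use_tokenizer: bool = False):
        # accept a single string or a list of strings
        self.texts: List[str] = [texts] if isinstance(texts, str) else texts
        self.use_tokenizer = use_tokenizer

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, index: int = 0) -> Sentence:
        text = self.texts[index]
        return Sentence(text, use_tokenizer=self.use_tokenizer)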