from flair.data import Sentence, Span, Token


def mock_ner_span(text, tag, start, end):
    span = Span([])
    span.tag = tag
    span.start_pos = start
    span.end_pos = end
    span.tokens = [Token(text[start:end])]
    return span
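# Hypothetical usage of the mock_ner_span helper above; the example text, tag and
# character offsets are made up for illustration.
example_text = "George Washington went to Berlin."
span = mock_ner_span(example_text, "PER", 0, 17)
assert span.tag == "PER"
assert span.tokens[0].text == "George Washington"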
def test_get_head():
    token1 = Token("I", 0)
    token2 = Token("love", 1, 0)
    token3 = Token("Berlin", 2, 1)

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert token2 == token3.get_head()
    assert token1 == token2.get_head()
    assert token1.get_head() is None
def test_sentence_add_token():
    token1: Token = Token("Munich")
    token2: Token = Token("and")
    token3: Token = Token("Berlin")
    token4: Token = Token("are")
    token5: Token = Token("nice")

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)
    sentence.add_token(token4)
    sentence.add_token(token5)
    sentence.add_token("cities")
    sentence.add_token(Token("."))

    assert "Munich and Berlin are nice cities ." == sentence.to_tokenized_string()
# Fragment from a column-format (CoNLL-style) dataset reader. NOTE: the original
# snippet is truncated at the top; the comment-line check below is reconstructed,
# and self.comment_symbol is assumed from context.
while line:
    if self.comment_symbol is not None and line.startswith(self.comment_symbol):
        line = file.readline()
        continue

    if self.__line_completes_sentence(line):
        if len(sentence) > 0:
            sentence.infer_space_after()
            if self.tag_to_bioes is not None:
                sentence.convert_tag_scheme(
                    tag_type=self.tag_to_bioes, target_scheme="iobes"
                )
            return sentence
    else:
        fields: List[str] = re.split(r"\s+", line)
        token = Token(fields[self.text_column])
        for column in self.column_name_map:
            if len(fields) > column:
                if column != self.text_column:
                    token.add_tag(
                        self.column_name_map[column], fields[column]
                    )

        if not line.isspace():
            sentence.add_token(token)

    line = file.readline()
return sentence
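# For context, the reader above parses whitespace-separated column data: one token
# per line, tag columns after the text column, and a boundary line ending each
# sentence, e.g. a CoNLL-style NER file:
#
#     George      B-PER
#     Washington  E-PER
#     went        O
#     to          O
#     Berlin      S-LOC
#
# A minimal sketch of what the loop builds for such a block, written by hand with
# the same flair calls (add_tag / add_token / infer_space_after); the tiny NER
# example is made up for illustration.
example_sentence = Sentence()
for word_text, ner_tag in [("George", "B-PER"), ("Washington", "E-PER"),
                           ("went", "O"), ("to", "O"), ("Berlin", "S-LOC")]:
    column_token = Token(word_text)
    column_token.add_tag("ner", ner_tag)  # same call as token.add_tag(...) in the reader
    example_sentence.add_token(column_token)
example_sentence.infer_space_after()
print(example_sentence.to_tagged_string())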
def get_sentences(text, lang, use_ontonotes, fast, use_embeddings,
                  char_embeddings, bpe_size, expressions, pos, sentiment) -> List[Sentence]:
    """Process text using Flair and return the output from Flair"""
    if lang not in ('en', 'multi', 'de', 'nl', 'fr'):
        raise TypeError(
            f'{lang} is not supported! Try multi. See '
            'https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_2_TAGGING.md')

    # tokenize sentences
    sentences = []
    for s in segment(text):
        sentence = Sentence()
        sentences.append(sentence)
        for t in s:
            sentence.add_token(
                Token(t.value, start_position=t.offset, whitespace_after=t.space_after)
            )

    # run models
    for model in get_models(lang=lang, use_ontonotes=use_ontonotes, fast=fast,
                            expressions=expressions, pos=pos, sentiment=sentiment):
        model.predict(sentences)

    # load embedding models
    if use_embeddings or char_embeddings or bpe_size > 0:
        get_embeddings(
            [e.strip() for e in use_embeddings.split(',')],
            char_embeddings, lang, bpe_size,
        ).embed(sentences)

    return sentences
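# The function above builds pre-tokenized Sentence objects and then calls
# model.predict(...) on them. A minimal sketch of that core pattern with a single
# stock flair tagger ("ner", downloaded on first use); the example text is made up.
from flair.models import SequenceTagger

demo_sentence = Sentence()
for word in ["Berlin", "is", "a", "nice", "city", "."]:
    demo_sentence.add_token(Token(word))
demo_sentence.infer_space_after()

tagger = SequenceTagger.load("ner")
tagger.predict([demo_sentence])  # same predict(...) call as in the model loop above
print(demo_sentence.to_tagged_string())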
def tokenizer(text: str) -> List[Token]:
    """
    Tokenizer using tiny_tokenizer, a third-party library which supports
    multiple Japanese tokenizers such as MeCab, KyTea and SudachiPy.
    """
    # sentence_tokenizer and word_tokenizer are tiny_tokenizer objects created
    # elsewhere in the enclosing module.
    tokens: List[Token] = []
    words: List[str] = []

    sentences = sentence_tokenizer.tokenize(text)
    for sentence in sentences:
        tiny_tokenizer_tokens = word_tokenizer.tokenize(sentence)
        words.extend(list(map(str, tiny_tokenizer_tokens)))

    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        try:
            word_offset = index(word, current_offset)
            start_position = word_offset
        except ValueError:
            word_offset = previous_word_offset + 1
            start_position = current_offset + 1 if current_offset > 0 else current_offset

        token = Token(text=word, start_position=start_position, whitespace_after=True)
        tokens.append(token)

        if (previous_token is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token

    return tokens
def sent_to_flair(sent):
    """
    Convert a tokenized sentence (a list of words) to a Flair Sentence object.
    """
    sentence = Sentence()
    for w in sent:
        token = Token(w)
        sentence.add_token(token)
    sentence.infer_space_after()
    return sentence
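# Hypothetical usage of sent_to_flair on a pre-tokenized sentence; the word list
# below is illustrative only.
flair_sentence = sent_to_flair(["I", "love", "Berlin", "."])
print(flair_sentence.to_tokenized_string())  # "I love Berlin ."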
# Fragment from a custom flair tokenizer: determine offsets for the
# whitespace_after field by locating each word in the original text.
index = text.index
current_offset = 0
previous_word_offset = -1
previous_token = None
for word in words:
    try:
        word_offset = index(word, current_offset)
        start_position = word_offset
    except ValueError:
        # word not found in the original text from current_offset on
        word_offset = previous_word_offset + 1
        start_position = current_offset + 1 if current_offset > 0 else current_offset

    if word:  # skip empty strings produced by the tokenizer
        token = Token(
            text=word, start_position=start_position, whitespace_after=True
        )
        tokens.append(token)

    if (previous_token is not None) and word_offset - 1 == previous_word_offset:
        previous_token.whitespace_after = False

    current_offset = word_offset + len(word)
    previous_word_offset = current_offset - 1
    previous_token = token

return tokens
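# The whitespace_after bookkeeping above controls how tokens are glued back
# together when the sentence is rendered as plain text. A minimal sketch,
# assuming Sentence.to_plain_string() behaves as in the flair versions these
# snippets target; the example string is made up.
from flair.data import Sentence, Token

glue_sentence = Sentence()
glue_sentence.add_token(Token("Berlin", start_position=0, whitespace_after=False))  # "," follows directly
glue_sentence.add_token(Token(",", start_position=6, whitespace_after=True))
glue_sentence.add_token(Token("Germany", start_position=8, whitespace_after=True))
print(glue_sentence.to_tokenized_string())  # "Berlin , Germany"
print(glue_sentence.to_plain_string())      # "Berlin, Germany"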