Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): truncated fragment — this snippet begins mid-method; the
# enclosing ``tokenize`` definition and the locals it uses (surface, postag,
# inflection, ...) are defined above this view. Presumably the tail of a
# MeCab/Janome-style tokenizer — confirm against the full source.
token = Token(
surface=surface,
postag=postag,
postag2=postag2,
postag3=postag3,
postag4=postag4,
inflection=inflection,
conjugation=conjugation,
base_form=base_form,
yomi=yomi,
pron=pron,
)
return_result.append(token)
else:
# Fallback branch: no POS tagging, so the parse result is a plain
# space-separated string of surface forms.
for surface in parse_result.split(" "):
return_result.append(Token(surface=surface))
return return_result
def tokenize(self, text: str) -> List[Token]:
    """Tokenize *text* with the wrapped Janome tokenizer.

    Args:
        text: Raw input string.

    Returns:
        A list of ``Token`` objects. When ``self._with_postag`` is true,
        each token carries the morphological features Janome exposes on a
        morph (part-of-speech levels, inflection, base form, reading,
        phonetic); otherwise only the surface form is kept.
    """
    return_result = []
    parse_result = self._tokenizer.tokenize(text)
    if self._with_postag:
        for morph in parse_result:
            # Janome packs four part-of-speech levels into a single
            # comma-separated string (e.g. "名詞,一般,*,*").
            postag, postag2, postag3, postag4 = morph.part_of_speech.split(",")
            token = Token(
                surface=morph.surface,
                postag=postag,
                postag2=postag2,
                postag3=postag3,
                postag4=postag4,
                inflection=morph.infl_type,
                conjugation=morph.infl_form,
                base_form=morph.base_form,
                yomi=morph.reading,
                pron=morph.phonetic,
            )
            return_result.append(token)
    else:
        for morph in parse_result:
            return_result.append(Token(surface=morph.surface))
    # BUG FIX: the original body fell off the end and implicitly returned
    # None, discarding every token it had collected; return the list, as
    # the parallel implementations in this file do.
    return return_result
# NOTE(review): truncated fragment — begins mid-method inside a KyTea-based
# tokenizer; the enclosing ``tokenize`` definition and the ``tokens`` list
# are created above this view.
response = self._tokenizer.getTagsToString(text)
# FIXME Following dirty workaround is required to
# process inputs which include itself
# (e.g. "私 は猫")
# NOTE(review): the second argument of each .replace below appears to have
# lost non-ASCII characters (likely an ideographic space U+3000 and an
# escape marker) in extraction — verify the literals against the original
# source before trusting this line.
response = response.replace("\\ ", "").replace(" ", " ")
for elem in response.split(" ")[:-1]:
# FIXME If input contains a character "/",
# KyTea outputs "//補助記号/・",
# which breaks the simple logic elem.split("/")
# Split from the right by reversing the string, so at most the last two
# "/" separators are honored: <surface>/<postag>/<pron>.
pron, postag, surface = map(
lambda e: e[::-1], elem[::-1].split("/", maxsplit=2)
)
# NOTE(review): first argument looks like a lost placeholder character
# (space-escape token) — confirm against the original source.
surface = surface.replace("", " ")
tokens.append(Token(surface=surface, postag=postag, pron=pron))
else:
# Word-segmentation-only mode: getWS returns surfaces without tags.
for surface in list(self._tokenizer.getWS(text)):
tokens.append(Token(surface=surface))
return tokens
# NOTE(review): truncated fragment — begins mid-method inside a MeCab-style
# tokenizer; ``parse_result``, ``return_result`` and ``self._parse_feature``
# are defined above/outside this view.
if self._with_postag:
# MeCab output is one morpheme per line; the trailing element after the
# final newline ("EOS"/empty) is dropped by [:-1].
for elem in parse_result.split("\n")[:-1]:
(
surface,
postag,
postag2,
postag3,
postag4,
inflection,
conjugation,
base_form,
yomi,
pron,
) = self._parse_feature(elem)
token = Token(
surface=surface,
postag=postag,
postag2=postag2,
postag3=postag3,
postag4=postag4,
inflection=inflection,
conjugation=conjugation,
base_form=base_form,
yomi=yomi,
pron=pron,
)
return_result.append(token)
else:
# Wakati (surface-only) mode: result is a space-separated string.
for surface in parse_result.split(" "):
return_result.append(Token(surface=surface))
# NOTE(review): duplicate truncated fragment of the KyTea tokenize body seen
# earlier in this file; ``response``/``tokens`` come from outside this view.
# (e.g. "私 は猫")
# NOTE(review): replacement literals appear to have lost non-ASCII
# characters during extraction — verify against the original source.
response = response.replace("\\ ", "").replace(" ", " ")
for elem in response.split(" ")[:-1]:
# FIXME If input contains a character "/",
# KyTea outputs "//補助記号/・",
# which breaks the simple logic elem.split("/")
# Right-split via string reversal: <surface>/<postag>/<pron>.
pron, postag, surface = map(
lambda e: e[::-1], elem[::-1].split("/", maxsplit=2)
)
surface = surface.replace("", " ")
tokens.append(Token(surface=surface, postag=postag, pron=pron))
else:
for surface in list(self._tokenizer.getWS(text)):
tokens.append(Token(surface=surface))
return tokens
def tokenize(self, text: str) -> List[Token]:
    """Split *text* on single ASCII spaces and wrap each piece in a Token."""
    tokens = []
    for piece in text.split(" "):
        tokens.append(Token(surface=piece))
    return tokens
# NOTE(review): truncated fragment — the opening ``token = Token(`` call and
# the enclosing method are above this view; these are the remaining keyword
# arguments of that call.
surface=surface,
postag=postag,
postag2=postag2,
postag3=postag3,
postag4=postag4,
inflection=inflection,
conjugation=conjugation,
base_form=base_form,
yomi=yomi,
pron=pron,
)
return_result.append(token)
else:
# Surface-only mode: keep just each morph's surface form.
for morph in parse_result:
return_result.append(Token(surface=morph.surface))
return return_result
# NOTE(review): truncated fragment — begins mid-method inside a Sudachi-based
# tokenizer; ``result`` and the enclosing ``tokenize`` definition are above
# this view. ``self._mode`` is presumably a sudachipy SplitMode — confirm.
for token in self._tokenizer.tokenize(text, self._mode):
surface = token.surface()
if self._with_postag:
# Sudachi's part_of_speech() returns a 6-element sequence:
# four POS levels plus inflection type and conjugated form.
(
postag,
postag2,
postag3,
postag4,
inflection,
conjugation,
) = token.part_of_speech()
base_form = token.dictionary_form()
normalized_form = token.normalized_form()
yomi = token.reading_form()
result.append(
Token(
surface=surface,
postag=postag,
postag2=postag2,
postag3=postag3,
postag4=postag4,
inflection=inflection,
conjugation=conjugation,
base_form=base_form,
normalized_form=normalized_form,
yomi=yomi,
)
)
else:
result.append(Token(surface=surface))
return result
def tokenize(self, text: str) -> List[Token]:
    """Tokenize *text* into subword Tokens via the SentencePiece model."""
    return [Token(surface=piece) for piece in self._tokenizer.EncodeAsPieces(text)]
# NOTE(review): truncated fragment — the enclosing ``result.append(`` call,
# the locals (surface, postag, ...), and the method definition are above
# this view; duplicate of the Sudachi tokenize tail seen earlier.
Token(
surface=surface,
postag=postag,
postag2=postag2,
postag3=postag3,
postag4=postag4,
inflection=inflection,
conjugation=conjugation,
base_form=base_form,
normalized_form=normalized_form,
yomi=yomi,
)
)
else:
result.append(Token(surface=surface))
return result