Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
last_morph = m
else:
last_morph = m
if last_morph:
morph_spaces.append((last_morph, False))
# the last space is removed by JapaneseReviser at the final stage of the pipeline
words = [m.surface() for m, spaces in morph_spaces]
spaces = [space for m, space in morph_spaces]
doc = Doc(self.nlp.vocab if self.nlp else Vocab(), words=words, spaces=spaces)
next_tag = morph_tag(morph_spaces[0][0].part_of_speech()[0:4]) if len(doc) else ''
for token, (morph, spaces) in zip(doc, morph_spaces):
tag = next_tag
next_tag = morph_tag(morph_spaces[token.i + 1][0].part_of_speech()[0:4]) if token.i < len(doc) - 1 else ''
token.tag_ = tag
token.pos = TAG_MAP[tag][POS]
# TODO: move the lexical rules into separate resource files
if morph.normalized_form() == '為る' and tag == '動詞-非自立可能':
token.pos_ = 'AUX'
elif tag == '名詞-普通名詞-サ変可能':
if next_tag == '動詞-非自立可能':
token.pos_ = 'VERB'
elif tag == '名詞-普通名詞-サ変形状詞可能':
if next_tag == '動詞-非自立可能':
token.pos_ = 'VERB'
elif next_tag == '助動詞' or next_tag.find('形状詞') >= 0:
token.pos_ = 'ADJ'
token.lemma_ = morph.normalized_form()
token._.inf = ','.join(morph.part_of_speech()[4:])
token._.reading = morph.reading_form()
token._.sudachi = morph
if self.use_sentence_separator: