# fragment: record the single dependency arc pointing outside the token span being merged;
# bail out if the span has more than one distinct outer head
            dep_outer_label = g['dep']
            head_pos = g['pos']
        elif dep_outer_id != head_id:
            return False
    # exactly one head outside the merged span must have been found above
    if dep_outer_id is None:
        print(gold_tokens[start:end], file=sys.stderr)
        raise Exception('unexpected state')
    elif start < dep_outer_id < end:
        dep_outer_id = start

    # overwrite the first gold token of the span with the attributes of the replacing token
    g = gold_tokens[start]
    g['orth'] = replacing_token.orth_
    g['lemma'] = replacing_token.lemma_
    g['pos'] = replacing_token.pos_
    g['tag'] = replacing_token.tag_
    g['inf'] = ex_attr(replacing_token).inf
    g['whitespace'] = replacing_token.whitespace_ != ''
    g['head'] = dep_outer_id - start
    if dep_outer_label.startswith('as_'):
        g['dep'] = dep_outer_label
    else:
        dep = dep_outer_label.split('_as_')[0]
        g['dep'] = dep if not extend_dep_labels or head_pos == g['pos'] else '{}_as_{}'.format(dep, head_pos)
    # shift the relative head offsets of the remaining tokens to account for the
    # end - start - 1 positions removed by merging the span [start, end)
    for g in gold_tokens:
        if g['id'] <= start and end <= g['id'] + g['head']:
            g['head'] -= end - start - 1
        elif g['id'] <= start < g['id'] + g['head']:
            g['head'] = start - g['id']
        elif g['id'] + g['head'] <= start and end <= g['id']:
            g['head'] += end - start - 1
        elif g['id'] + g['head'] < end <= g['id']:
            ...  # the body of this branch is truncated in the original excerpt
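# A minimal sketch (toy values, not from the original source) of the offset arithmetic above:
# heads are stored as relative offsets (absolute head id = id + head), and merging the span
# [start, end) into one token removes end - start - 1 positions from the sentence.
start, end = 1, 3                # merge tokens 1 and 2 into a single token
shift = end - start - 1          # one position disappears

head_of_token0 = 4               # token 0 -> token 4; the head moves left, the token does not
head_of_token0 -= shift          # first branch above: offset becomes 3

head_of_token4 = -4              # token 4 -> token 0; the token moves left, the head does not
head_of_token4 += shift          # third branch above: offset becomes -3

print(head_of_token0, head_of_token4)  # 3 -3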
class Japanese(Language):
    lang = "ja"
    Defaults = JapaneseDefaults
    Tokenizer = SudachiTokenizer

    def make_doc(self, text):
        # tokenization alone produces the Doc; the rest of the pipeline is applied separately
        return self.tokenizer(text)


def pickle_japanese(instance):
    # reducer used by pickle: a restored pipeline is simply a fresh Japanese()
    return Japanese, tuple()


copy_reg.pickle(Japanese, pickle_japanese)

__all__ = [
    'Japanese',
]
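# Hypothetical round-trip (not in the original snippet): with the reducer registered through
# copy_reg.pickle, dumping a Japanese pipeline and loading it back yields a fresh Japanese(),
# which is what spaCy relies on when shipping Language objects to worker processes.
import pickle

nlp = Japanese()
restored = pickle.loads(pickle.dumps(nlp))
assert isinstance(restored, Japanese)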
# The snippet resumes inside a command-line entry point whose signature is not included;
# the parameter names below are inferred from their use in the body, while the function
# name and default values are assumptions.
def run(
    model_path=None,
    mode='C',
    use_sentence_separator=True,
    recreate_corrector=False,
    disable_pipes='',
    output_path=None,
    require_gpu=False,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)

    # load a user-supplied model directory or fall back to the published ja_ginza model
    if model_path:
        nlp = spacy.load(model_path)
    else:
        nlp = spacy.load('ja_ginza')

    if disable_pipes:
        print("disabling pipes: {}".format(disable_pipes), file=sys.stderr)
        nlp.disable_pipes(disable_pipes)
        print("using : {}".format(nlp.pipe_names), file=sys.stderr)

    if recreate_corrector:
        if 'JapaneseCorrector' in nlp.pipe_names:
            nlp.remove_pipe('JapaneseCorrector')
        corrector = JapaneseCorrector(nlp)
        nlp.add_pipe(corrector, last=True)

    # select the SudachiPy split mode (A gives the shortest splits, C the longest)
    if mode == 'A':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.A
    elif mode == 'B':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.B
    elif mode == 'C':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.C
    else:
        raise Exception('mode should be A, B or C')
    print("mode is {}".format(mode), file=sys.stderr)

    if not use_sentence_separator:
        print("disabling sentence separator", file=sys.stderr)
        nlp.tokenizer.use_sentence_separator = False

    if output_path:
        ...  # output handling is truncated in this snippet
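# Hypothetical usage of the setup above (the model name and sample text are assumptions;
# the tokenizer settings and printed attributes are the ones used in the snippet itself):
nlp = spacy.load('ja_ginza')
nlp.tokenizer.mode = OriginalTokenizer.SplitMode.B   # middle-grained Sudachi split mode
nlp.tokenizer.use_sentence_separator = False         # keep the tokenizer from splitting sentences

doc = nlp('銀座でランチをご一緒しましょう。')
for token in doc:
    print(token.i, token.orth_, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.i)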
Language.factories['JapaneseCorrector'] = lambda nlp, **cfg: JapaneseCorrector(nlp)
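# With the factory registered under the name 'JapaneseCorrector', the component can be
# re-created by name (spaCy v2 style); a sketch assuming an already loaded `nlp`:
corrector = nlp.create_pipe('JapaneseCorrector')
nlp.add_pipe(corrector, last=True)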
def create_model_path(output_dir, model_name, model_version):
    return output_dir / '{}_{}-{}'.format(Japanese.lang, model_name, model_version)
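# For example (hypothetical arguments), output_dir must support the / operator, e.g. a pathlib.Path:
from pathlib import Path

print(create_model_path(Path('models'), 'ginza', '3.1.0'))  # models/ja_ginza-3.1.0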
def create_tokenizer(cls, nlp=None):
    return SudachiTokenizer(nlp)
                last_morph = m
            else:
                last_morph = m
        # append the morpheme still pending when the loop over SudachiPy morphemes ends
        if last_morph:
            morph_spaces.append((last_morph, False))

        # the last space is removed by JapaneseReviser at the final stage of pipeline
        words = [m.surface() for m, spaces in morph_spaces]
        spaces = [space for m, space in morph_spaces]
        doc = Doc(self.nlp.vocab if self.nlp else Vocab(), words=words, spaces=spaces)

        # assign Sudachi-derived tag, UD pos, lemma, inflection and reading to each token,
        # peeking one morpheme ahead to disambiguate verbal and adjectival nouns
        next_tag = morph_tag(morph_spaces[0][0].part_of_speech()[0:4]) if len(doc) else ''
        for token, (morph, spaces) in zip(doc, morph_spaces):
            tag = next_tag
            next_tag = morph_tag(morph_spaces[token.i + 1][0].part_of_speech()[0:4]) if token.i < len(doc) - 1 else ''
            token.tag_ = tag
            token.pos = TAG_MAP[tag][POS]
            # TODO separate lexical rules to resource files
            if morph.normalized_form() == '為る' and tag == '動詞-非自立可能':
                token.pos_ = 'AUX'
            elif tag == '名詞-普通名詞-サ変可能':
                if next_tag == '動詞-非自立可能':
                    token.pos_ = 'VERB'
            elif tag == '名詞-普通名詞-サ変形状詞可能':
                if next_tag == '動詞-非自立可能':
                    token.pos_ = 'VERB'
                elif next_tag == '助動詞' or next_tag.find('形状詞') >= 0:
                    token.pos_ = 'ADJ'
            token.lemma_ = morph.normalized_form()
            token._.inf = ','.join(morph.part_of_speech()[4:])
            token._.reading = morph.reading_form()
            token._.sudachi = morph

        if self.use_sentence_separator: