How to use ginza - 7 common examples

To help you get started, we've selected a few ginza examples based on popular ways the library is used in public projects.


github megagonlabs / ginza / ginza_util / conllu_to_json.py
                # (the enclosing loop that sets g, dep_outer_id and head_id
                #  is truncated on the source page)
                dep_outer_label = g['dep']
                head_pos = g['pos']
            elif dep_outer_id != head_id:
                return False
    if dep_outer_id is None:
        print(gold_tokens[start:end], file=sys.stderr)
        raise Exception('unexpected state')
    elif start < dep_outer_id < end:
        dep_outer_id = start

    g = gold_tokens[start]
    g['orth'] = replacing_token.orth_
    g['lemma'] = replacing_token.lemma_
    g['pos'] = replacing_token.pos_
    g['tag'] = replacing_token.tag_
    g['inf'] = ex_attr(replacing_token).inf
    g['whitespace'] = replacing_token.whitespace_ != ''
    g['head'] = dep_outer_id - start
    if dep_outer_label.startswith('as_'):
        g['dep'] = dep_outer_label
    else:
        dep = dep_outer_label.split('_as_')[0]
        g['dep'] = dep if not extend_dep_labels or head_pos == g['pos'] else '{}_as_{}'.format(dep, head_pos)

    for g in gold_tokens:
        if g['id'] <= start and end <= g['id'] + g['head']:
            g['head'] -= end - start - 1
        elif g['id'] <= start < g['id'] + g['head']:
            g['head'] = start - g['id']
        elif g['id'] + g['head'] <= start and end <= g['id']:
            g['head'] += end - start - 1
        elif g['id'] + g['head'] < end <= g['id']:
            ...  # the final branch's body is truncated on the source page
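The re-indexing in the final loop relies on 'head' being stored as an offset relative to the token's own 'id': merging the span [start, end) removes end - start - 1 positions, so every head reference that crosses the span has to shift by that amount. Here is a runnable toy version of the same arithmetic (the data is hypothetical; the in-span tokens themselves are replaced by the merged token separately, as in the snippet above):

# 'head' is a relative offset (head_id - id)
gold_tokens = [
    {'id': 0, 'head': 0},   # root
    {'id': 1, 'head': -1},  # attaches to token 0
    {'id': 2, 'head': -1},  # attaches to token 1
    {'id': 3, 'head': -3},  # attaches to token 0, across the span
]
start, end = 1, 3           # tokens 1 and 2 get merged into one token

for g in gold_tokens:
    if start <= g['id'] < end:
        continue  # in-span tokens are replaced by the merged token elsewhere
    if g['id'] <= start and end <= g['id'] + g['head']:
        g['head'] -= end - start - 1   # head lies beyond the span
    elif g['id'] <= start < g['id'] + g['head']:
        g['head'] = start - g['id']    # head lies inside the span
    elif g['id'] + g['head'] <= start and end <= g['id']:
        g['head'] += end - start - 1   # token follows the span, head precedes it

# token 3 now carries head == -2: it sits at index 2 after the merge
# and still points at token 0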
github megagonlabs / ginza / ginza / __init__.py
# imports as in spaCy v2.x (copy_reg is spaCy's copyreg alias);
# JapaneseDefaults and SudachiTokenizer are defined elsewhere in the ginza package
from spacy.language import Language
from spacy.compat import copy_reg

class Japanese(Language):
    lang = "ja"
    Defaults = JapaneseDefaults
    Tokenizer = SudachiTokenizer

    def make_doc(self, text):
        return self.tokenizer(text)


def pickle_japanese(instance):
    return Japanese, tuple()


copy_reg.pickle(Japanese, pickle_japanese)


__all__ = [
    'Japanese',
]
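A minimal usage sketch (assuming the ja_ginza model package is installed):

import spacy

nlp = spacy.load('ja_ginza')  # the GiNZA pipeline built on this Japanese class
doc = nlp('銀座でランチをご一緒しましょう。')
for token in doc:
    print(token.i, token.orth_, token.lemma_, token.pos_, token.tag_)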
github megagonlabs / ginza / ginza / command_line.py
# the function header is cut off on the source page; the name below is
# hypothetical and the parameters are inferred from how the body uses them
def load_analyzer(
    model_path=None,
    mode='C',
    use_sentence_separator=True,
    disable_pipes=None,
    recreate_corrector=False,
    output_path=None,
    require_gpu=False,
):
    if require_gpu:
        spacy.require_gpu()
        print("GPU enabled", file=sys.stderr)
    if model_path:
        nlp = spacy.load(model_path)
    else:
        nlp = spacy.load('ja_ginza')
    if disable_pipes:
        print("disabling pipes: {}".format(disable_pipes), file=sys.stderr)
        nlp.disable_pipes(disable_pipes)
        print("using : {}".format(nlp.pipe_names), file=sys.stderr)
    if recreate_corrector:
        if 'JapaneseCorrector' in nlp.pipe_names:
            nlp.remove_pipe('JapaneseCorrector')
        corrector = JapaneseCorrector(nlp)
        nlp.add_pipe(corrector, last=True)

    if mode == 'A':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.A
    elif mode == 'B':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.B
    elif mode == 'C':
        nlp.tokenizer.mode = OriginalTokenizer.SplitMode.C
    else:
        raise Exception('mode should be A, B or C')
    print("mode is {}".format(mode), file=sys.stderr)
    if not use_sentence_separator:
        print("disabling sentence separator", file=sys.stderr)
        nlp.tokenizer.use_sentence_separator = False

    if output_path:
        ...  # output handling is truncated on the source page
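Modes A, B and C select SudachiPy's short, middle and long unit splits. A quick way to see the difference is to call SudachiPy directly (a sketch, assuming sudachipy and its system dictionary are installed):

from sudachipy import dictionary, tokenizer

tok = dictionary.Dictionary().create()
text = '選挙管理委員会'
for mode in (tokenizer.Tokenizer.SplitMode.A,
             tokenizer.Tokenizer.SplitMode.B,
             tokenizer.Tokenizer.SplitMode.C):
    print([m.surface() for m in tok.tokenize(text, mode)])
# with the standard dictionary this prints roughly:
# A: 選挙/管理/委員/会, B: 選挙/管理/委員会, C: 選挙管理委員会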
github megagonlabs / ginza / ginza / __init__.py
Language.factories['JapaneseCorrector'] = lambda nlp, **cfg: JapaneseCorrector(nlp)
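With the factory registered, spaCy v2 can rebuild the component by name, both when a saved pipeline is loaded from disk and on demand (a brief sketch, assuming an already-loaded nlp object):

corrector = nlp.create_pipe('JapaneseCorrector')  # resolved through Language.factories
nlp.add_pipe(corrector, last=True)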
github megagonlabs / ginza / ginza_util / util.py
def create_model_path(output_dir, model_name, model_version):
    return output_dir / '{}_{}-{}'.format(Japanese.lang, model_name, model_version)
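Since Japanese.lang is 'ja', the helper simply joins the pieces into a versioned directory name. For example (hypothetical arguments):

from pathlib import Path

create_model_path(Path('models'), 'gsd', '3.0.0')
# -> models/ja_gsd-3.0.0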
github megagonlabs / ginza / ginza / __init__.py
    # a classmethod hook on JapaneseDefaults (the decorator and surrounding
    # class body are not shown in this excerpt)
    def create_tokenizer(cls, nlp=None):
        return SudachiTokenizer(nlp)
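In spaCy v2 the Language constructor obtains its tokenizer through this Defaults hook, so instantiating the class is enough to tokenize (a sketch reusing the Japanese class from the earlier example):

nlp = Japanese()
doc = nlp.make_doc('すもももももももものうち')
print([t.orth_ for t in doc])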
github megagonlabs / ginza / ginza / sudachi_tokenizer.py
                    # (the start of the morpheme loop is truncated on the source page)
                    last_morph = m
                else:
                    last_morph = m
        if last_morph:
            morph_spaces.append((last_morph, False))

        # the last space is removed by JapaneseReviser at the final stage of pipeline
        words = [m.surface() for m, spaces in morph_spaces]
        spaces = [space for m, space in morph_spaces]
        doc = Doc(self.nlp.vocab if self.nlp else Vocab(), words=words, spaces=spaces)
        next_tag = morph_tag(morph_spaces[0][0].part_of_speech()[0:4]) if len(doc) else ''
        for token, (morph, spaces) in zip(doc, morph_spaces):
            tag = next_tag
            next_tag = morph_tag(morph_spaces[token.i + 1][0].part_of_speech()[0:4]) if token.i < len(doc) - 1 else ''
            token.tag_ = tag
            token.pos = TAG_MAP[tag][POS]
            # TODO separate lexical rules to resource files
            if morph.normalized_form() == '為る' and tag == '動詞-非自立可能':
                token.pos_ = 'AUX'
            elif tag == '名詞-普通名詞-サ変可能':
                if next_tag == '動詞-非自立可能':
                    token.pos_ = 'VERB'
            elif tag == '名詞-普通名詞-サ変形状詞可能':
                if next_tag == '動詞-非自立可能':
                    token.pos_ = 'VERB'
                elif next_tag == '助動詞' or next_tag.find('形状詞') >= 0:
                    token.pos_ = 'ADJ'
            token.lemma_ = morph.normalized_form()
            token._.inf = ','.join(morph.part_of_speech()[4:])
            token._.reading = morph.reading_form()
            token._.sudachi = morph
        if self.use_sentence_separator:
            ...  # sentence-separator handling is truncated on the source page
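The lexical rules above retag, for example, a サ変可能 noun as VERB when a 非自立可能 verb follows it, and the loop fills GiNZA's token extensions. Reading them back looks like this (a sketch, assuming ja_ginza is installed):

import spacy

nlp = spacy.load('ja_ginza')
doc = nlp('勉強します')
for token in doc:
    # _.reading and _.inf are the extensions set in the loop above
    print(token.orth_, token.lemma_, token.pos_, token._.reading, token._.inf)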

ginza

GiNZA, an open-source Japanese NLP library based on Universal Dependencies. License: MIT.