How to use the sudachipy.tokenizer module in SudachiPy

To help you get started, we’ve selected a few sudachipy.tokenizer examples based on popular ways the module is used in public projects.

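Before the project-specific examples, here is a minimal sketch of the core pattern, assuming a dictionary package such as sudachidict_core is installed:

from sudachipy import dictionary, tokenizer

# Build a tokenizer from the default dictionary settings.
tokenizer_obj = dictionary.Dictionary().create()

# Tokenize in mode C (the coarsest split) and inspect each morpheme.
mode = tokenizer.Tokenizer.SplitMode.C
for m in tokenizer_obj.tokenize('東京都へ行く', mode):
    print(m.surface(), m.part_of_speech(), m.normalized_form())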

github WorksApplications / SudachiPy / tests / test_tokenizer.py
def test_tokenizer_morpheme_split(self):
        from sudachipy import tokenizer
        ms = self.tokenizer_obj.tokenize('東京都', tokenizer.Tokenizer.SplitMode.C)
        self.assertEqual(1, ms.size())
        self.assertEqual(ms[0].surface(), '東京都')

        ms_a = ms[0].split(tokenizer.Tokenizer.SplitMode.A)
        self.assertEqual(2, ms_a.size())
        self.assertEqual(ms_a[0].surface(), '東京')
        self.assertEqual(ms_a[1].surface(), '都')
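
The test above shows the general relationship between the modes: A yields the shortest units and C the longest. A quick comparison sketch, under the same dictionary assumption as the intro example:

from sudachipy import dictionary, tokenizer

tok = dictionary.Dictionary().create()
for mode in (tokenizer.Tokenizer.SplitMode.A,
             tokenizer.Tokenizer.SplitMode.B,
             tokenizer.Tokenizer.SplitMode.C):
    # per the test above: A gives ['東京', '都'], C gives ['東京都']
    print([m.surface() for m in tok.tokenize('東京都', mode)])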
github WorksApplications / SudachiPy / sudachipy / morphemelist.py
def split(self, mode, index, wi):
        # Pick the word-ID list matching the requested split granularity;
        # mode C is the morpheme itself, so no re-splitting is needed.
        if mode is tokenizer.Tokenizer.SplitMode.A:
            word_ids = wi.a_unit_split
        elif mode is tokenizer.Tokenizer.SplitMode.B:
            word_ids = wi.b_unit_split
        else:
            return [self.__getitem__(index)]

        # No finer split is available: return the morpheme unchanged.
        if len(word_ids) <= 1:
            return [self.__getitem__(index)]

        # Build one lattice node per sub-word, laid out contiguously over
        # the span of the original morpheme.
        offset = self.path[index].get_begin()
        nodes = []
        for wid in word_ids:
            n = latticenode.LatticeNode(self.lexicon, 0, 0, 0, wid)
            n.set_begin(offset)
            offset += n.get_word_info().head_word_length
            n.set_end(offset)
            nodes.append(n)
github WorksApplications / SudachiPy / sudachipy / command_line.py
def _command_tokenize(args, print_usage):
    if args.version:
        print_version()
        return

    _input_files_checker(args, print_usage)

    if args.mode == "A":
        mode = tokenizer.Tokenizer.SplitMode.A
    elif args.mode == "B":
        mode = tokenizer.Tokenizer.SplitMode.B
    else:
        mode = tokenizer.Tokenizer.SplitMode.C

    stdout_logger = logging.getLogger(__name__)
    output = sys.stdout
    if args.fpath_out:
        output = open(args.fpath_out, "w", encoding="utf-8")
    handler = logging.StreamHandler(output)
    handler.setLevel(logging.DEBUG)
    stdout_logger.addHandler(handler)
    stdout_logger.setLevel(logging.DEBUG)
    stdout_logger.propagate = False

    print_all = args.a
    enable_dump = args.d
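
In the published CLI these flags control output detail: -a is expected to print extra fields such as the dictionary form and reading, and -d to emit lattice debug output; treat the exact field set as an assumption here.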
github megagonlabs / ginza / sudachipy / dictionary.py
def create(self):
        return tokenizer.Tokenizer(self.grammar, self.lexicon, self.input_text_plugins, self.oov_provider_plugins, self.path_rewrite_plugins)
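
A typical call sequence for this factory, mirroring the command-line example below:

# settings is a parsed sudachi.json, as in the command-line snippets
dict_ = dictionary.Dictionary(settings)
tokenizer_obj = dict_.create()  # -> tokenizer.Tokenizer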
github megagonlabs / ginza / sudachipy / command_line.py
def _command_tokenize(args, print_usage):

    with open(args.fpath_setting, "r", encoding="utf-8") as f:
        settings = json.load(f)

    if args.mode == "A":
        mode = tokenizer.Tokenizer.SplitMode.A
    elif args.mode == "B":
        mode = tokenizer.Tokenizer.SplitMode.B
    else:
        mode = tokenizer.Tokenizer.SplitMode.C

    output = sys.stdout
    if args.fpath_out:
        output = open(args.fpath_out, "w", encoding="utf-8")

    print_all = args.a

    is_enable_dump = args.d

    dict_ = dictionary.Dictionary(settings)
    tokenizer_obj = dict_.create()
    if is_enable_dump:
        tokenizer_obj.set_dump_output(output)

    input_ = fileinput.input(args.input_files, openhook=fileinput.hook_encoded("utf-8"))
    run(tokenizer_obj, mode, input_, output, print_all)
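
The run helper itself is not part of this snippet; a plausible sketch of what it does, with the tab-separated field layout treated as an assumption rather than the project's verbatim code:

def run(tokenizer_obj, mode, input_, output, print_all):
    # Hypothetical reconstruction of the CLI output loop.
    for line in input_:
        for m in tokenizer_obj.tokenize(line.rstrip('\n'), mode):
            fields = [m.surface(), ','.join(m.part_of_speech()), m.normalized_form()]
            if print_all:
                fields += [m.dictionary_form(), m.reading_form()]
            output.write('\t'.join(fields) + '\n')
        output.write('EOS\n')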
github megagonlabs / ginza / ginza / sudachi_tokenizer.py
def try_import_sudachipy_split_mode():
    try:
        from sudachipy import tokenizer
        return tokenizer.Tokenizer.SplitMode
    except ImportError:
        raise ImportError(
            "Japanese support requires SudachiPy distributed with ja language model"
        )
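
Callers can then use the returned enum exactly as if it had been imported directly:

SplitMode = try_import_sudachipy_split_mode()
mode = SplitMode.B  # equivalent to tokenizer.Tokenizer.SplitMode.B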
github megagonlabs / ginza / sudachipy / dictionary.py
def read_user_dictionary(self, filename):
        # Memory-map the user dictionary read-only and keep the buffer alive
        # for the lifetime of this Dictionary.
        with open(filename, 'r+b') as user_dic:
            bytes_ = mmap.mmap(user_dic.fileno(), 0, prot=mmap.PROT_READ)
        self.buffers.append(bytes_)

        # Estimate connection costs for the user lexicon with a throwaway
        # tokenizer, then register it alongside the system lexicon.
        user_lexicon = dictionarylib.doublearraylexicon.DoubleArrayLexicon(bytes_, 0)
        tokenizer_ = tokenizer.JapaneseTokenizer(self.grammar, self.lexicon, self.input_text_plugins, self.oov_provider_plugins, [])
        user_lexicon.calculate_cost(tokenizer_)
        self.lexicon.append(user_lexicon)
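
A sketch of settings that would lead Dictionary to load a user lexicon this way; the key names follow Sudachi's settings convention and should be treated as assumptions:

settings = {
    'systemDict': 'system.dic',
    'userDict': ['user.dic'],  # files handed to read_user_dictionary
}
dict_ = dictionary.Dictionary(settings)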
github himkt / tiny_tokenizer / konoha / word_tokenizers / sudachi_tokenizer.py
name="sudachi ({})".format(mode), with_postag=with_postag,
        )
        try:
            self._tokenizer = dictionary.Dictionary().create()
        except KeyError:
            msg = "Loading a dictionary fails."
            msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )"  # NOQA
            raise KeyError(msg)

        _mode = mode.capitalize()
        if _mode == "A":
            self._mode = tokenizer.Tokenizer.SplitMode.A
        elif _mode == "B":
            self._mode = tokenizer.Tokenizer.SplitMode.B
        elif _mode == "C":
            self._mode = tokenizer.Tokenizer.SplitMode.C
        else:
            raise ValueError("Invalid mode is specified. Mode should be A, B, or C.")  # NOQA
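
In practice this wrapper is reached through konoha's WordTokenizer; a usage sketch, assuming konoha and a Sudachi dictionary package are installed:

from konoha import WordTokenizer

tok = WordTokenizer('Sudachi', mode='A', with_postag=True)
print(tok.tokenize('自然言語処理'))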