How to use the sudachipy.dictionarylib module in SudachiPy

To help you get started, we’ve selected a few SudachiPy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
if filename is None:
            raise AttributeError("system dictionary is not specified")
        with open(filename, 'r+b') as system_dic:
            bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
        buffers.append(bytes_)

        offset = 0
        header = dictionarylib.dictionaryheader.DictionaryHeader.from_bytes(bytes_, offset)
        if header.version != SYSTEM_DICT_VERSION:
            raise Exception("invalid system dictionary")
        offset += header.storage_size()

        grammar = dictionarylib.grammar.Grammar(bytes_, offset)
        offset += grammar.get_storage_size()

        lexicon = dictionarylib.lexiconset.LexiconSet(dictionarylib.doublearraylexicon.DoubleArrayLexicon(bytes_, offset))
        return buffers, header, grammar, lexicon
github WorksApplications / SudachiPy / tests / test_utf8inputtext.py View on Github external
def test_get_char_category_types(self):
        """Check that selected offsets of the built input report the expected category.

        Each (offset, category) pair below is verified by asserting that the
        category is contained in ``get_char_category_types(offset)``.
        """
        text = self.builder.build()
        category = dictionarylib.categorytype.CategoryType
        expectations = (
            (0, category.ALPHA),
            (2, category.ALPHA),
            (5, category.ALPHA),
            (6, category.NUMERIC),
            (7, category.HIRAGANA),
            (9, category.HIRAGANA),
            (10, category.NUMERIC),
            (13, category.KANJI),
            (18, category.KANJI),
            (19, category.DEFAULT),
            (22, category.DEFAULT),
            (23, category.KATAKANA),
            (26, category.KATAKANA),
            (31, category.KATAKANA),
        )
        # Checked in the listed order; the first failing offset aborts the test.
        for offset, expected in expectations:
            self.assertTrue(expected in text.get_char_category_types(offset))
github megagonlabs / ginza / sudachipy / dictionary.py View on Github external
def read_character_definition(self, filename):
        """Load a character definition file and attach it to the grammar.

        A new ``CharacterCategory`` is created, asked to read ``filename``,
        and handed to ``self.grammar.set_character_category``. When
        ``self.grammar`` is ``None`` nothing is read and nothing is changed.
        """
        if self.grammar is not None:
            categories = dictionarylib.charactercategory.CharacterCategory()
            categories.read_character_definition(filename)
            self.grammar.set_character_category(categories)
github WorksApplications / SudachiPy / sudachipy / dictionary.py View on Github external
def _read_character_definition(self, filename):
        """Load a character definition file and attach it to the grammar.

        A new ``CharacterCategory`` is created, asked to read ``filename``,
        and handed to ``self.grammar.set_character_category``. When
        ``self.grammar`` is ``None`` nothing is read and nothing is changed.
        """
        if self.grammar is not None:
            categories = dictionarylib.charactercategory.CharacterCategory()
            categories.read_character_definition(filename)
            self.grammar.set_character_category(categories)
github megagonlabs / ginza / sudachipy / lattice.py View on Github external
def connect_node(self, r_node):
        """Link ``r_node`` to its cheapest usable predecessor.

        Candidates are the nodes in ``self.end_lists[r_node.begin]``. A
        candidate is ignored when its ``is_connected_to_bos`` flag is false or
        when the grammar reports ``Grammar.INHIBITED_CONNECTION`` for the pair.
        Among the rest, the one with the lowest
        ``total_cost + connection cost`` becomes ``r_node.best_previous_node``
        (the first one wins on ties).

        Afterwards ``r_node.is_connected_to_bos`` reflects whether a
        predecessor is set, and ``r_node.cost`` is added to
        ``r_node.total_cost``.
        """
        start = r_node.begin
        r_node.total_cost = float('inf')
        # NOTE(review): best_previous_node is not reset here; this presumably
        # relies on the node being created with it set to None -- confirm.
        for candidate in self.end_lists[start]:
            if candidate.is_connected_to_bos:
                link_cost = self.grammar.get_connect_cost(candidate.right_id, r_node.left_id)
                if link_cost != dictionarylib.grammar.Grammar.INHIBITED_CONNECTION:
                    path_cost = candidate.total_cost + link_cost
                    # Strict comparison keeps the earliest candidate on equal cost.
                    if path_cost < r_node.total_cost:
                        r_node.total_cost = path_cost
                        r_node.best_previous_node = candidate

        r_node.is_connected_to_bos = r_node.best_previous_node is not None
        r_node.total_cost += r_node.cost
if filename is None:
            raise AttributeError("system dictionary is not specified")
        with open(filename, 'r+b') as system_dic:
            bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
        self.buffers.append(bytes_)

        offset = 0
        self.header = dictionarylib.dictionaryheader.DictionaryHeader.from_bytes(bytes_, offset)
        if self.header.version != DictionaryVersion.SYSTEM_DICT_VERSION:
            raise Exception("invalid system dictionary")
        offset += self.header.storage_size()

        self.grammar = dictionarylib.grammar.Grammar(bytes_, offset)
        offset += self.grammar.get_storage_size()

        self.lexicon = dictionarylib.lexiconset.LexiconSet(dictionarylib.doublearraylexicon.DoubleArrayLexicon(bytes_, offset))
github megagonlabs / ginza / sudachipy / command_line.py View on Github external
if filename is None:
        raise AttributeError("system dictionary is not specified")
    with open(filename, 'r+b') as system_dic:
        bytes_ = mmap.mmap(system_dic.fileno(), 0, access=mmap.ACCESS_READ)
    buffers.append(bytes_)

    offset = 0
    header = DictionaryHeader.from_bytes(bytes_, offset)
    if header.version != DictionaryVersion.SYSTEM_DICT_VERSION:
        raise Exception("invalid system dictionary")
    offset += header.storage_size()

    grammar = dictionarylib.grammar.Grammar(bytes_, offset)
    offset += grammar.get_storage_size()

    lexicon = dictionarylib.lexiconset.LexiconSet(dictionarylib.doublearraylexicon.DoubleArrayLexicon(bytes_, offset))
    return buffers, header, grammar, lexicon