How to use the sudachipy.dictionarylib.categorytype.CategoryType class in SudachiPy

To help you get started, we’ve selected a few SudachiPy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github WorksApplications / SudachiPy / sudachipy / dictionarylib / charactercategory.py View on Github external
n = len(self.range_list)
        end = n
        pivot = (begin + end) // 2
        while 0 <= pivot < n:
            range_ = self.range_list[pivot]
            if range_.contains(code_point):
                return range_.categories
            if range_.lower(code_point):
                begin = pivot
            else:  # range_.higher(code_point)
                end = pivot
            new_pivot = (begin + end) // 2
            if new_pivot == pivot:
                break
            pivot = new_pivot
        return {categorytype.CategoryType.DEFAULT}
github WorksApplications / SudachiPy / tests / test_utf8inputtext.py View on Github external
def test_get_char_category_types(self):
        """Check the category types reported at sampled indices of the built input.

        Each entry in ``expectations`` pairs an index passed to
        ``get_char_category_types`` with a category that must be contained
        in the returned collection. Entries are checked in order, so the
        first mismatch is the one reported.
        """
        text = self.builder.build()
        category = dictionarylib.categorytype.CategoryType
        expectations = (
            (0, category.ALPHA),
            (2, category.ALPHA),
            (5, category.ALPHA),
            (6, category.NUMERIC),
            (7, category.HIRAGANA),
            (9, category.HIRAGANA),
            (10, category.NUMERIC),
            (13, category.KANJI),
            (18, category.KANJI),
            (19, category.DEFAULT),
            (22, category.DEFAULT),
            (23, category.KATAKANA),
            (26, category.KATAKANA),
            (31, category.KATAKANA),
        )
        for index, expected in expectations:
            self.assertTrue(expected in text.get_char_category_types(index))
github WorksApplications / SudachiPy / tests / plugin / test_mecab_oov_plugin.py View on Github external
def test_read_oov(self):
        """Write a two-line OOV definition file and verify what read_oov loads.

        Both lines belong to the DEFAULT category, so the plugin is expected
        to hold a single category entry with two OOV records; the first
        record's ids, cost and POS id are checked against the first line.
        """
        oov = os.path.join(self.test_dir, 'test.txt')
        # Explicit UTF-8: the fixture lines contain Japanese text, which the
        # platform default encoding (e.g. cp1252) may be unable to encode.
        # NOTE(review): assumes read_oov decodes the file as UTF-8 - confirm.
        with open(oov, 'w', encoding='utf-8') as wf:
            wf.write("DEFAULT,1,2,3,補助記号,一般,*,*,*,*\n")
            wf.write("DEFAULT,3,4,5,補助記号,一般,*,*,*,*\n")
        plugin = MeCabOovPlugin()
        # The DEFAULT category is registered on the plugin before reading.
        plugin.categories[CategoryType.DEFAULT] = MeCabOovPlugin.CategoryInfo()
        plugin.read_oov(oov, mock_grammar.mocked_grammar)
        self.assertEqual(1, len(plugin.oov_list))
        self.assertEqual(2, len(plugin.oov_list[CategoryType.DEFAULT]))
        self.assertEqual(1, plugin.oov_list[CategoryType.DEFAULT][0].left_id)
        self.assertEqual(2, plugin.oov_list[CategoryType.DEFAULT][0].right_id)
        self.assertEqual(3, plugin.oov_list[CategoryType.DEFAULT][0].cost)
        self.assertEqual(0, plugin.oov_list[CategoryType.DEFAULT][0].pos_id)
github WorksApplications / SudachiPy / sudachipy / dictionarylib / charactercategory.py View on Github external
raise AttributeError("invalid format at line {}".format(i))
            if not re.match("0x", cols[0]):
                continue
            range_ = self.Range()
            r = re.split("\\.\\.", cols[0])
            range_.low = int(r[0], 16)
            range_.high = range_.low + 1
            if len(r) > 1:
                range_.high = int(r[1], 16) + 1
            if range_.low >= range_.high:
                f.close()
                raise AttributeError("invalid range at line {}".format(i))
            for j in range(1, len(cols)):
                if re.match("#", cols[j]) or cols[j] == '':
                    break
                type_ = categorytype.CategoryType.get(cols[j])
                if type_ is None:
                    f.close()
                    raise AttributeError("{} is invalid type at line {}".format(cols[j], i))
                range_.categories.append(type_)
            self.range_list.append(range_)

        f.close()
        self._compile()
github WorksApplications / SudachiPy / sudachipy / dictionarylib / categorytype.py View on Github external
def get_type(self, id_):
        """Return the category type whose id equals ``id_``.

        Scans ``CategoryType.values()`` in order and returns the first
        entry whose ``get_id()`` matches.

        Returns:
            The matching category type, or None when no id matches.
        """
        for type_ in CategoryType.values():
            # Compare by value: ``is`` tests object identity, which is not
            # guaranteed to hold for two equal integers.
            if type_.get_id() == id_:
                return type_
        return None
github WorksApplications / SudachiPy / sudachipy / plugin / path_rewrite / join_katakana_oov_plugin.py View on Github external
def can_oov_bow_node(self, text, node):
        """Return True unless NOOOVBOW is among the category types at the node's begin.

        The category types are looked up on ``text`` at ``node.get_begin()``.
        """
        types_at_begin = text.get_char_category_types(node.get_begin())
        return not (CategoryType.NOOOVBOW in types_at_begin)
github WorksApplications / SudachiPy / sudachipy / plugin / path_rewrite / join_numeric_plugin.py View on Github external
def rewrite(self, text, path, lattice):
        begin_index = -1
        comma_as_digit = True
        period_as_digit = True
        parser = NumericParser()
        i = -1

        while i < len(path) - 1:
            i += 1
            node = path[i]
            types = self.get_char_category_types(text, node)
            s = node.get_word_info().normalized_form
            if CategoryType.NUMERIC in types or CategoryType.KANJINUMERIC in types or \
               (period_as_digit and s == '.') or (comma_as_digit and s == ','):

                if begin_index < 0:
                    parser.clear()
                    begin_index = i

                for c in s:
                    if not parser.append(c):
                        if begin_index >= 0:
                            if parser.error_state == NumericParser.Error.COMMA:
                                comma_as_digit = False
                                i = begin_index - 1
                            elif parser.error_state == NumericParser.Error.POINT:
                                period_as_digit = False
                                i = begin_index - 1
                            begin_index = -1
github megagonlabs / ginza / sudachipy / dictionarylib / charactercategory.py View on Github external
def get_category_types(self, code_point):
        """Return the categories of the first range containing ``code_point``.

        Ranges in ``self.range_list`` are tried in order; the ``categories``
        of the first one whose ``contains(code_point)`` is true are returned.

        Returns:
            That range's ``categories``, or a one-element set holding
            ``CategoryType.DEFAULT`` when no range contains the code point.
        """
        for range_ in self.range_list:
            if range_.contains(code_point):
                return range_.categories
        # Set literal on purpose: set(x) would try to iterate the single
        # member instead of wrapping it in a set.
        return {categorytype.CategoryType.DEFAULT}
github WorksApplications / SudachiPy / sudachipy / tokenizer.py View on Github external
continue
            iterator = self._lexicon.lookup(bytes_, i)
            has_words = False
            for word_id, end in iterator:
                if (end < len(bytes_)) and (not input_.can_bow(end)):
                    continue
                has_words = True
                n = LatticeNode(self._lexicon,
                                self._lexicon.get_left_id(word_id),
                                self._lexicon.get_right_id(word_id),
                                self._lexicon.get_cost(word_id),
                                word_id)
                self._lattice.insert(i, end, n)

            # OOV
            if CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
                for oov_plugin in self._oov_provider_plugins:
                    for node in oov_plugin.get_oov(input_, i, has_words):
                        has_words = True
                        self._lattice.insert(node.get_begin(), node.get_end(), node)
            if not has_words and self.default_oov_provider:
                for node in self.default_oov_provider.get_oov(input_, i, has_words):
                    has_words = True
                    self._lattice.insert(node.get_begin(), node.get_end(), node)

            if not has_words:
                raise RuntimeError("there is no morpheme at " + str(i))
        self._lattice.connect_eos_node()