How to use the sudachipy.utf8inputtextbuilder.UTF8InputTextBuilder class in SudachiPy

To help you get started, we’ve selected a few SudachiPy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github WorksApplications / SudachiPy / tests / plugin / test_prolongedsoundmarkinput.py View on Github external
def test_combine_continuous_prolonged_sound_marks_multi_symbol_types(self):
        """Check that runs of different prolonged-sound symbols are each rewritten to one 'ー'.

        The input mixes 'ー', '〜' and '〰' runs; after the plugin rewrites the
        builder, the built text must equal 'エービーシー' and the offsets of the
        rewritten bytes must map back to the expected indices of the original.
        """
        source = 'エーービ〜〜〜シ〰〰〰〰'
        expected_text = 'エービーシー'
        expected_bytes = b'\xe3\x82\xa8\xe3\x83\xbc\xe3\x83\x93\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xbc'
        # (byte offset passed to get_original_index, expected return value)
        offset_pairs = (
            (0, 0),
            (3, 1),
            (6, 3),
            (9, 4),
            (12, 7),
            (15, 8),
            (18, 12),
        )

        builder = UTF8InputTextBuilder(source, mocked_grammar)
        self.plugin.rewrite(builder)
        rewritten = builder.build()

        # The original text is kept untouched next to the rewritten one.
        self.assertEqual(source, rewritten.original_text)
        self.assertEqual(expected_text, rewritten.get_text())

        byte_text = rewritten.get_byte_text()
        self.assertEqual(18, len(byte_text))
        self.assertEqual(expected_bytes, byte_text)

        for byte_offset, expected_index in offset_pairs:
            self.assertEqual(expected_index, rewritten.get_original_index(byte_offset))
github WorksApplications / SudachiPy / tests / plugin / test_join_numeric_plugin.py View on Github external
def get_path(self, text: str):
        """Return the best lattice path for *text* after the plugin has rewritten it.

        Builds the input text with the tokenizer's grammar, builds the lattice
        for it, takes the best path, passes it through ``self.plugin.rewrite``
        and clears the lattice before returning the path.
        """
        tokenizer = self.tokenizer
        builder = UTF8InputTextBuilder(text, tokenizer._grammar)
        input_text = builder.build()

        tokenizer._build_lattice(input_text)
        best_path = tokenizer._lattice.get_best_path()
        self.plugin.rewrite(input_text, best_path, tokenizer._lattice)

        # Clear the lattice once the path has been taken and rewritten.
        tokenizer._lattice.clear()
        return best_path
github WorksApplications / SudachiPy / tests / plugin / test_default_input_text_plugin.py View on Github external
def setUp(self):
        """Create the input builder, the plugin under test and the resources path.

        The test is failed with 'no file' if the plugin's ``set_up`` raises
        ``IOError``.
        """
        self.builder = UTF8InputTextBuilder(
            self.original_text,
            mock_grammar.mocked_grammar,
        )

        plugin = DefaultInputTextPlugin()
        self.plugin = plugin
        try:
            plugin.set_up()
        except IOError:
            self.fail('no file')

        # '<directory of this file>/../resources' (left un-normalized).
        this_dir = os.path.dirname(os.path.abspath(__file__))
        self.test_resources_dir = os.path.join(this_dir, os.pardir, 'resources')
github WorksApplications / SudachiPy / tests / plugin / test_prolongedsoundmarkinput.py View on Github external
def test_combine_continuous_prolonged_sound_mark(self):
        """Check that the doubled 'ー' in 'ゴーール' is rewritten to a single one.

        Also verifies the rewritten UTF-8 bytes and that byte offsets of the
        rewritten text map back to the expected indices of the original.
        """
        source = 'ゴーール'
        expected_text = 'ゴール'
        expected_bytes = b'\xe3\x82\xb4\xe3\x83\xbc\xe3\x83\xab'
        # (byte offset passed to get_original_index, expected return value)
        offset_pairs = ((0, 0), (3, 1), (6, 3), (9, 4))

        builder = UTF8InputTextBuilder(source, mocked_grammar)
        self.plugin.rewrite(builder)
        rewritten = builder.build()

        # The original text is kept untouched next to the rewritten one.
        self.assertEqual(source, rewritten.original_text)
        self.assertEqual(expected_text, rewritten.get_text())

        byte_text = rewritten.get_byte_text()
        self.assertEqual(9, len(byte_text))
        self.assertEqual(expected_bytes, byte_text)

        for byte_offset, expected_index in offset_pairs:
            self.assertEqual(expected_index, rewritten.get_original_index(byte_offset))
github WorksApplications / SudachiPy / tests / test_utf8inputtext.py View on Github external
b'0x32', b'0x33', b'0x34', b'0xE6', b'0xBC',
            b'0xA2', b'0xE5', b'0xAD', b'0x97', b'0xF0',
            b'0xA1', b'0x88', b'0xBD', b'0xE3', b'0x82',
            b'0xA2', b'0xEF', b'0xBD', b'0xBA', b'0xEF',
            b'0xBE', b'0x9E'
        ]

        self.input = None

        grammar = self.MockGrammar()
        char_category = dictionarylib.charactercategory.CharacterCategory()
        this_dir = os.path.dirname(os.path.abspath(__file__))
        char_category.read_character_definition(os.path.join(this_dir, 'resources/char.def'))
        grammar.set_character_category(char_category)

        self.builder = sudachipy.utf8inputtextbuilder.UTF8InputTextBuilder(self.TEXT, grammar)
github WorksApplications / SudachiPy / tests / plugin / test_join_katakana_oov_plugin.py View on Github external
def get_path(self, text: str):
        """Tokenize *text* into a best path and let the plugin under test rewrite it.

        The lattice is built from the input text, queried for its best path,
        handed to ``self.plugin.rewrite`` together with that path, and then
        cleared; the path is returned.
        """
        tokenizer = self.tokenizer
        input_text = UTF8InputTextBuilder(text, tokenizer._grammar).build()
        tokenizer._build_lattice(input_text)

        best_path = tokenizer._lattice.get_best_path()
        self.plugin.rewrite(input_text, best_path, tokenizer._lattice)

        # Clear the lattice once the path has been taken and rewritten.
        tokenizer._lattice.clear()
        return best_path
github WorksApplications / SudachiPy / tests / plugin / test_prolongedsoundmarkinput.py View on Github external
def test_combine_continuous_prolonged_sound_marks_multi_times(self):
        """Check that several separate runs of 'ー' are each rewritten to a single 'ー'.

        The input contains runs of two, three and four marks; the built text
        must equal 'エービーシー' and the offsets of the rewritten bytes must map
        back to the expected indices of the original.
        """
        source = 'エーービーーーシーーーー'
        expected_text = 'エービーシー'
        expected_bytes = b'\xe3\x82\xa8\xe3\x83\xbc\xe3\x83\x93\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xbc'
        # (byte offset passed to get_original_index, expected return value)
        offset_pairs = (
            (0, 0),
            (3, 1),
            (6, 3),
            (9, 4),
            (12, 7),
            (15, 8),
            (18, 12),
        )

        builder = UTF8InputTextBuilder(source, mocked_grammar)
        self.plugin.rewrite(builder)
        rewritten = builder.build()

        # The original text is kept untouched next to the rewritten one.
        self.assertEqual(source, rewritten.original_text)
        self.assertEqual(expected_text, rewritten.get_text())

        byte_text = rewritten.get_byte_text()
        self.assertEqual(18, len(byte_text))
        self.assertEqual(expected_bytes, byte_text)

        for byte_offset, expected_index in offset_pairs:
            self.assertEqual(expected_index, rewritten.get_original_index(byte_offset))
github megagonlabs / ginza / sudachipy / tokenizer.py View on Github external
def tokenize(self, mode, text):
        if not text:
            return []

        builder = utf8inputtextbuilder.UTF8InputTextBuilder(text, self.grammar)
        for plugin in self.input_text_plugins:
            plugin.rewrite(builder)
        input_ = builder.build()
        bytes_ = input_.get_byte_text()

        self.lattice.resize(len(bytes_))
        for i in range(len(bytes_)):
            if not input_.is_char_alignment(i) or not self.lattice.has_previous_node(i):
                continue
            iterator = self.lexicon.lookup(bytes_, i)
            has_words = True if iterator else False
            for word_id, end in iterator:
                n = latticenode.LatticeNode(self.lexicon,
                                            self.lexicon.get_left_id(word_id),
                                            self.lexicon.get_right_id(word_id),
                                            self.lexicon.get_cost(word_id),