How to use the sudachipy.dictionarylib.dictionarybuilder.DictionaryBuilder class in SudachiPy

To help you get started, we’ve selected a few SudachiPy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
def test_parseline(self):
        """Check the entry that parse_line builds from a split CSV row.

        The first row is expected to produce the headword, the three integer
        parameters, POS id 0 and '*' for both split strings.  The second row
        carries -1, -1, 0 in the parameter columns and is expected to produce
        an entry without a headword while still resolving to POS id 0.
        """
        builder = DictionaryBuilder(logger=self.logger)

        regular_row = '京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'
        regular = builder.parse_line(regular_row.split(','))
        self.assertEqual('京都', regular.headword)
        self.assertEqual([6, 6, 5293], regular.parameters)
        self.assertEqual(0, regular.wordinfo.pos_id)
        self.assertEqual('*', regular.aunit_split_string)
        self.assertEqual('*', regular.bunit_split_string)

        # Same word and part of speech, but with -1 in the first two parameter columns.
        headless_row = '京都,-1,-1,0,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'
        headless = builder.parse_line(headless_row.split(','))
        self.assertIsNone(headless.headword)
        self.assertEqual(0, headless.wordinfo.pos_id)
github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
def test_parse_line_empty_headword(self):
        """parse_line must raise ValueError('headword is empty') when the first column is empty."""
        columns = ',6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'.split(',')
        builder = DictionaryBuilder(logger=self.logger)

        with self.assertRaises(ValueError) as raised:
            builder.parse_line(columns)

        message = raised.exception.args[0]
        self.assertEqual('headword is empty', message)
github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
def test_parse_line_same_readingform(self):
        """A row whose reading-form column equals its surface must keep that reading form."""
        columns = '〒,6,6,5293,〒,名詞,普通名詞,一般,*,*,*,〒,〒,*,A,*,*,*'.split(',')
        builder = DictionaryBuilder(logger=self.logger)

        parsed = builder.parse_line(columns)

        self.assertEqual('〒', parsed.wordinfo.reading_form)
github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
def test_write_intarray(self):
        """Check the bytes write_intarray appends to the builder's byte buffer.

        The assertions expect a one-byte element count followed by each
        element as a 4-byte little-endian signed integer.
        """
        builder = DictionaryBuilder(logger=self.logger)
        start = builder.byte_buffer.tell()

        # An empty array is expected to take exactly one byte: the count 0.
        builder.write_intarray([])
        self.assertEqual(0, builder.byte_buffer.getvalue()[start])

        # The second array begins right after that single count byte.
        builder.write_intarray([1, 2, 3])
        written = builder.byte_buffer.getvalue()
        self.assertEqual(3, written[start + 1])
        for index, expected in enumerate((1, 2, 3)):
            begin = start + 2 + 4 * index
            decoded = int.from_bytes(written[begin:begin + 4], byteorder='little', signed=True)
            self.assertEqual(expected, decoded)
github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
def test_parse_splitinfo(self):
        """Check parse_splitinfo on a system builder and on a user-dictionary builder.

        On the plain builder, '*' is expected to give an empty list and a
        'U' prefix is expected to leave the word ID unchanged.  On the user
        builder, a 'U'-prefixed ID is expected to come back with bit 28 set.
        """
        system_builder = DictionaryBuilder(logger=self.logger)
        # Register placeholder entries so the IDs 1..3 used below are accepted.
        system_builder.entries.extend([None, None, None, None])

        self.assertEqual([], system_builder.parse_splitinfo('*'))
        self.assertEqual([1, 2, 3], system_builder.parse_splitinfo('1/2/3'))
        prefixed = system_builder.parse_splitinfo('1/U2/3')
        self.assertEqual(2, prefixed[1])

        mocked_lexicon = mock.Mock(spec=Lexicon)
        mocked_lexicon.size.return_value = 4
        user_builder = UserDictionaryBuilder(None, mocked_lexicon)
        user_builder.entries.extend([None] * 3)

        user_marked_id = (1 << 28) | 2
        self.assertEqual([1, user_marked_id, 3], user_builder.parse_splitinfo("1/U2/3"))
github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
def test_parse_splitinfo_invalid_wordid(self):
        """parse_splitinfo must raise ValueError('invalid word ID') on a builder with no entries added."""
        builder = DictionaryBuilder(logger=self.logger)

        with self.assertRaises(ValueError) as raised:
            builder.parse_splitinfo('1/2/3')

        message = raised.exception.args[0]
        self.assertEqual('invalid word ID', message)
github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
def test_write_string(self):
        """Check the bytes write_string appends to the builder's byte buffer.

        The assertions expect a one-byte length followed by the text as
        UTF-16 little-endian code units.

        NOTE(review): this excerpt stops right after ``long_str`` is set up;
        the assertions for the long-string case are not visible here.
        """
        builder = DictionaryBuilder(logger=self.logger)
        position = builder.byte_buffer.tell()
        # The empty string is expected to take exactly one byte: the length 0.
        builder.write_string('')
        self.assertEqual(0, builder.byte_buffer.getvalue()[position])
        self.assertEqual(position + 1, builder.byte_buffer.tell())

        position = builder.byte_buffer.tell()
        # 'あ' is one UTF-16 code unit and '𠮟' (U+20B9F) is a surrogate pair,
        # so the expected length byte is 3 code units.
        builder.write_string('あ𠮟')
        self.assertEqual(3, builder.byte_buffer.getvalue()[position])
        self.assertEqual('あ', builder.byte_buffer.getvalue()[position + 1: position + 3].decode('utf-16-le'))
        a = int.from_bytes(builder.byte_buffer.getvalue()[position + 3: position + 5], byteorder='little')
        b = int.from_bytes(builder.byte_buffer.getvalue()[position + 5: position + 7], byteorder='little')
        self.assertEqual(55362, a)  # \ud842 (high surrogate)
        self.assertEqual(57247, b)  # \udf9f (low surrogate)

        position = builder.byte_buffer.tell()
        # 130-character input for the long-string case.
        long_str = '0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789'
github WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py View on Github external
def test_build(self):
        """Build a system dictionary from the test fixtures and verify header and grammar.

        Writes a DictionaryHeader followed by the builder output to
        ``output.txt`` under ``self.test_dir``, reads the file back with
        ``self.read_system_dictionary`` and checks the header fields, the
        part-of-speech table and one connection cost.
        """
        out_path = os.path.join(self.test_dir, 'output.txt')
        lexicon_paths = [self.input_path]
        header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()), 'test')

        # Context managers guarantee both streams are closed even when
        # write() or build() raises, so a failing build cannot leak handles.
        with open(out_path, 'wb') as out_stream, \
                open(self.matrix_path, 'r', encoding='utf-8') as matrix_input_stream:
            out_stream.write(header.to_bytes())
            builder = DictionaryBuilder(logger=self.logger)
            builder.build(lexicon_paths, matrix_input_stream, out_stream)

        buffers, header, grammar, lexicon_set = self.read_system_dictionary(out_path)
        lexicon = lexicon_set.lexicons[0]

        # header
        self.assertEqual(SYSTEM_DICT_VERSION, header.version)
        self.assertEqual('test', header.description)

        # grammar
        self.assertEqual(2, grammar.get_part_of_speech_size())
        self.assertEqual(["名詞", "固有名詞", "地名", "一般", "*", "*"], grammar.get_part_of_speech_string(0))
        self.assertEqual(["名詞", "普通名詞", "一般", "*", "*", "*"], grammar.get_part_of_speech_string(1))
        self.assertEqual(200, grammar.get_connect_cost(0, 0))
github megagonlabs / ginza / sudachipy / command_line.py View on Github external
def _command_build(args, print_usage):
    """Build a system dictionary file from parsed command-line arguments.

    Args:
        args: parsed arguments; ``matrix_file``, ``in_files``, ``out_file``
            and ``description`` are read here.
        print_usage: passed through to the matrix-file and input-file checkers.
    """
    _matrix_file_checker(args, print_usage)
    _input_files_checker(args, print_usage)
    header = DictionaryHeader(
        DictionaryVersion.SYSTEM_DICT_VERSION, int(time.time()), args.description)
    # Read the matrix as UTF-8 explicitly so the result does not depend on
    # the platform's locale-default text encoding.
    with open(args.out_file, 'wb') as wf, \
            open(args.matrix_file, 'r', encoding='utf-8') as rf:
        # The header must precede the builder's output in the file.
        wf.write(header.to_bytes())
        builder = DictionaryBuilder()
        builder.build(args.in_files, rf, wf)
github WorksApplications / SudachiPy / sudachipy / command_line.py View on Github external
def _command_build(args, print_usage):
    """Build a system dictionary file from parsed command-line arguments.

    Args:
        args: parsed arguments; ``matrix_file``, ``in_files``, ``out_file``
            and ``description`` are read here.
        print_usage: passed through to the matrix-file and input-file checkers.
    """
    _matrix_file_checker(args, print_usage)
    _input_files_checker(args, print_usage)
    header = DictionaryHeader(
        SYSTEM_DICT_VERSION, int(time.time()), args.description)
    # Read the matrix as UTF-8 explicitly so the result does not depend on
    # the platform's locale-default text encoding.
    with open(args.out_file, 'wb') as wf, \
            open(args.matrix_file, 'r', encoding='utf-8') as rf:
        # The header must precede the builder's output in the file.
        wf.write(header.to_bytes())
        builder = DictionaryBuilder()
        builder.build(args.in_files, rf, wf)