How to use the sudachipy.dictionarylib.wordinfo.WordInfo class in SudachiPy

To help you get started, we’ve selected a few SudachiPy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github megagonlabs / ginza / sudachipy / plugin / oov / simple_oov_plugin.py View on Github external
def provide_oov(self, input_text, offset, has_other_words):
    """Return one fallback node at ``offset``, or nothing if other words exist.

    When ``has_other_words`` is truthy an empty list is returned.  Otherwise
    a node is created, given this plugin's ``left_id`` / ``right_id`` /
    ``cost``, and paired with a ``WordInfo`` whose surface, normalized form
    and dictionary form are all the substring that starts at ``offset``.
    The substring length is whatever
    ``input_text.get_code_points_offset_length(offset, 1)`` reports.

    Returns:
        A one-element list holding the new node, or an empty list.
    """
    if has_other_words:
        return []

    oov_node = self.create_node()
    oov_node.set_parameter(self.left_id, self.right_id, self.cost)

    span = input_text.get_code_points_offset_length(offset, 1)
    text = input_text.get_substring(offset, offset + span)
    oov_node.set_word_info(
        wordinfo.WordInfo(
            surface=text,
            head_word_length=span,
            pos_id=self.oov_pos_id,
            normalized_form=text,
            dictionary_form_word_id=-1,  # -1: no separate dictionary-form entry
            dictionary_form=text,
            reading_form="",
            a_unit_split=[],
            b_unit_split=[],
            word_structure=[],
        )
    )
    return [oov_node]
github WorksApplications / SudachiPy / sudachipy / plugin / oov / simple_oov_plugin.py View on Github external
def provide_oov(self, input_text, offset, has_other_words):
    """Return one fallback node at ``offset``, or nothing if other words exist.

    When ``has_other_words`` is truthy an empty list is returned.  Otherwise
    a node is created, given this plugin's ``left_id`` / ``right_id`` /
    ``cost``, and paired with a ``WordInfo`` whose surface, normalized form
    and dictionary form are all the substring that starts at ``offset``.
    The substring length is whatever
    ``input_text.get_word_candidate_length(offset)`` reports.

    Returns:
        A one-element list holding the new node, or an empty list.
    """
    if has_other_words:
        return []

    oov_node = self.create_node()
    oov_node.set_parameter(self.left_id, self.right_id, self.cost)

    span = input_text.get_word_candidate_length(offset)
    text = input_text.get_substring(offset, offset + span)
    oov_node.set_word_info(
        wordinfo.WordInfo(
            surface=text,
            head_word_length=span,
            pos_id=self.oov_pos_id,
            normalized_form=text,
            dictionary_form_word_id=-1,  # -1: no separate dictionary-form entry
            dictionary_form=text,
            reading_form="",
            a_unit_split=[],
            b_unit_split=[],
            word_structure=[],
        )
    )
    return [oov_node]
github megagonlabs / ginza / sudachipy / dictionarylib / wordinfolist.py View on Github external
if not normalized_form:
            normalized_form = surface
        dictionary_form_word_id = int.from_bytes(self.bytes.read(4), 'little', signed=True)
        reading_form = self.buffer_to_string()
        a_unit_split = self.buffer_to_int_array()
        b_unit_split = self.buffer_to_int_array()
        word_structure = self.buffer_to_int_array()

        dictionary_form = surface
        if dictionary_form_word_id >= 0 and dictionary_form_word_id != word_id:
            wi = self.get_word_info(dictionary_form_word_id)
            dictionary_form = wi.surface

        self.bytes.seek(orig_pos)

        return wordinfo.WordInfo(surface, head_word_length, pos_id, normalized_form,
                                 dictionary_form_word_id, dictionary_form, reading_form,
                                 a_unit_split, b_unit_split, word_structure)
github megagonlabs / ginza / sudachipy / plugin / oov / mecab_oov_plugin.py View on Github external
def get_oov_node(self, text, oov, length):
    """Create a node for ``text`` using the parameters carried by ``oov``.

    The node receives ``oov.left_id``, ``oov.right_id`` and ``oov.cost`` as
    its parameters, and a ``WordInfo`` in which ``text`` serves as surface,
    normalized form and dictionary form, ``length`` is the head word length
    and ``oov.pos_id`` is the part-of-speech id.

    Returns:
        The newly created node.
    """
    oov_node = self.create_node()
    oov_node.set_parameter(oov.left_id, oov.right_id, oov.cost)
    oov_node.set_word_info(
        wordinfo.WordInfo(
            surface=text,
            head_word_length=length,
            pos_id=oov.pos_id,
            normalized_form=text,
            dictionary_form_word_id=-1,  # -1: no separate dictionary-form entry
            dictionary_form=text,
            reading_form="",
            a_unit_split=[],
            b_unit_split=[],
            word_structure=[],
        )
    )
    return oov_node
github WorksApplications / SudachiPy / sudachipy / dictionarylib / dictionarybuilder.py View on Github external
raise ValueError('invalid part of speech')

        entry.aunit_split_string = cols[15]
        entry.bunit_split_string = cols[16]
        entry.cunit_split_string = cols[17]
        self.check_splitinfo_format(entry.aunit_split_string)
        self.check_splitinfo_format(entry.bunit_split_string)
        self.check_splitinfo_format(entry.cunit_split_string)

        if cols[14] == 'A' and \
                not (entry.aunit_split_string == '*' and entry.bunit_split_string == '*'):
            raise ValueError('invalid splitting')

        head_length = len(cols[0].encode('utf-8'))
        dict_from_wordid = -1 if cols[13] == '*' else int(cols[13])
        entry.wordinfo = WordInfo(
            cols[4], head_length, pos_id, cols[12], dict_from_wordid, '', cols[11], None, None, None)
        return entry
github WorksApplications / SudachiPy / sudachipy / plugin / path_rewrite / path_rewrite_plugin.py View on Github external
def concatenate_oov(self, path, begin, end, pos_id, lattice):
    """Replace ``path[begin:end]`` with one merged node and return it.

    The merged node spans from the begin of ``path[begin]`` to the end of
    ``path[end - 1]``.  Its ``WordInfo`` uses the concatenated surfaces of
    the replaced nodes as surface, normalized form and dictionary form, the
    sum of their head word lengths, and the supplied ``pos_id``.

    Raises:
        IndexError: if ``begin >= end``.

    Returns:
        The node created through ``lattice.create_node()``; ``path`` is
        modified in place.
    """
    if begin >= end:
        raise IndexError("begin >= end")

    span_start = path[begin].get_begin()
    span_stop = path[end - 1].get_end()

    # Fetch each word info once, then derive both aggregates from the list.
    infos = [path[idx].get_word_info() for idx in range(begin, end)]
    merged_surface = "".join(item.surface for item in infos)
    merged_length = sum(item.head_word_length for item in infos)

    merged_info = WordInfo(
        surface=merged_surface,
        head_word_length=merged_length,
        pos_id=pos_id,
        normalized_form=merged_surface,
        dictionary_form=merged_surface,
        dictionary_form_word_id=-1,
        reading_form="",
        a_unit_split=[],
        b_unit_split=[],
        word_structure=[],
    )

    merged_node = lattice.create_node()
    merged_node.set_range(span_start, span_stop)
    merged_node.set_word_info(merged_info)

    path[begin:end] = [merged_node]
    return merged_node