How to use the sudachipy.latticenode.LatticeNode class in SudachiPy

To help you get started, we’ve selected a few SudachiPy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github WorksApplications / SudachiPy / sudachipy / morphemelist.py View on Github external
def split(self, mode, index, wi):
        """Split the morpheme at *index* into sub-units for the given split mode.

        Returns a single-element list holding the original morpheme when the
        mode is C (or unknown) or when no finer split exists; otherwise
        returns a new MorphemeList built from the sub-unit lattice nodes.
        """
        split_mode = tokenizer.Tokenizer.SplitMode
        if mode is split_mode.A:
            sub_ids = wi.a_unit_split
        elif mode is split_mode.B:
            sub_ids = wi.b_unit_split
        else:
            return [self[index]]

        # Zero or one sub-unit means this morpheme is already atomic.
        if len(sub_ids) <= 1:
            return [self[index]]

        position = self.path[index].get_begin()
        sub_nodes = []
        for word_id in sub_ids:
            node = latticenode.LatticeNode(self.lexicon, 0, 0, 0, word_id)
            node.set_begin(position)
            # Each sub-node spans exactly its head word; advance past it.
            position += node.get_word_info().head_word_length
            node.set_end(position)
            sub_nodes.append(node)

        return MorphemeList(self.input_text, self.grammar, self.lexicon, sub_nodes)
github megagonlabs / ginza / sudachipy / tokenizer.py View on Github external
return []

        builder = utf8inputtextbuilder.UTF8InputTextBuilder(text, self.grammar)
        for plugin in self.input_text_plugins:
            plugin.rewrite(builder)
        input_ = builder.build()
        bytes_ = input_.get_byte_text()

        self.lattice.resize(len(bytes_))
        for i in range(len(bytes_)):
            if not input_.is_char_alignment(i) or not self.lattice.has_previous_node(i):
                continue
            iterator = self.lexicon.lookup(bytes_, i)
            has_words = True if iterator else False
            for word_id, end in iterator:
                n = latticenode.LatticeNode(self.lexicon,
                                            self.lexicon.get_left_id(word_id),
                                            self.lexicon.get_right_id(word_id),
                                            self.lexicon.get_cost(word_id),
                                            word_id)
                self.lattice.insert(i, end, n)

            # OOV
            if categorytype.CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
                for oov_plugin in self.oov_provider_plugins:
                    for node in oov_plugin.get_oov(input_, i, has_words):
                        has_words = True
                        self.lattice.insert(node.get_begin(), node.get_end(), node)
            if not has_words and self.default_oov_provider:
                for node in self.default_oov_provider.get_oov(input_, i, has_words):
                    has_words = True
                    self.lattice.insert(node.get_begin(), node.get_end(), node)
github megagonlabs / ginza / sudachipy / tokenizer.py View on Github external
for plugin in self.path_rewrite_plugins:
            plugin.rewrite(input_, path, self.lattice)

        if mode is not self.SplitMode.C:
            new_path = []
            for node in path:
                if mode is self.SplitMode.A:
                    wids = node.get_word_info().a_unit_split
                else:  # self.SplitMode.B
                    wids = node.get_word_info().b_unit_split
                if len(wids) == 0 or len(wids) == 1:
                    new_path.append(node)
                else:
                    offset = node.get_begin()
                    for wid in wids:
                        n = latticenode.LatticeNode(self.lexicon, 0, 0, 0, wid)
                        n.begin = offset
                        offset += n.get_word_info().head_word_length
                        n.end = offset
                        new_path.append(n)
            path = new_path

        # dump_output

        ml = morphemelist.MorphemeList(input_, self.grammar, self.lexicon, path)
        return ml
github megagonlabs / ginza / sudachipy / plugin / oov / simple_oov_plugin.py View on Github external
def create_node(self):
        """Build and return a fresh lattice node flagged as out-of-vocabulary."""
        oov_node = latticenode.LatticeNode()
        oov_node.set_oov()
        return oov_node
github WorksApplications / SudachiPy / sudachipy / lattice.py View on Github external
def create_node() -> LatticeNode:
        """Return a new, default-initialized lattice node."""
        fresh_node = LatticeNode()
        return fresh_node
github megagonlabs / ginza / sudachipy / plugin / oov / mecab_oov_plugin.py View on Github external
def create_node(self):
        """Create a lattice node and mark it out-of-vocabulary before returning it."""
        new_node = latticenode.LatticeNode()
        new_node.set_oov()
        return new_node
github WorksApplications / SudachiPy / sudachipy / lattice.py View on Github external
def resize(self, size: int) -> None:
        """Set the lattice length to *size* and rebuild the EOS node there.

        The backing storage is expanded only when it is too small; it is
        never shrunk.
        """
        if size > self.capacity:
            self.expand(size)
        self.size = size

        eos = LatticeNode()
        eos.set_parameter(self.eos_params[0],
                          self.eos_params[1],
                          self.eos_params[2])
        # The EOS node is a zero-width node anchored at the new end position.
        eos.begin = size
        eos.end = size
        self.eos_node = eos
github megagonlabs / ginza / sudachipy / lattice.py View on Github external
def resize(self, size):
        """Resize the lattice to *size*, expanding capacity when required,
        and re-create the end-of-sentence node anchored at the new end.
        """
        if self.capacity < size:
            self.expand(size)
        self.size = size

        end_node = latticenode.LatticeNode()
        left_id, right_id, cost = self.eos_params[0], self.eos_params[1], self.eos_params[2]
        end_node.set_parameter(left_id, right_id, cost)
        # Zero-width EOS node sitting exactly at position `size`.
        end_node.begin = end_node.end = size
        self.eos_node = end_node
github WorksApplications / SudachiPy / sudachipy / tokenizer.py View on Github external
def _split_path(self, path: List[LatticeNode], mode: SplitMode) -> List[LatticeNode]:
        """Expand each node on *path* into its A- or B-unit sub-nodes.

        Mode C returns the path untouched. Nodes with at most one sub-unit
        are carried over unchanged; all others are replaced by one node per
        sub-unit word id, laid out contiguously from the node's begin offset.
        """
        if mode == self.SplitMode.C:
            return path

        use_a_units = mode is self.SplitMode.A
        result = []
        for node in path:
            info = node.get_word_info()
            sub_ids = info.a_unit_split if use_a_units else info.b_unit_split
            if len(sub_ids) <= 1:
                result.append(node)
                continue
            pos = node.get_begin()
            for sub_id in sub_ids:
                sub_node = LatticeNode(self._lexicon, 0, 0, 0, sub_id)
                sub_node.begin = pos
                pos += sub_node.get_word_info().head_word_length
                sub_node.end = pos
                result.append(sub_node)
        return result