How to use the konoha.data.token.Token class in konoha

To help you get started, we’ve selected a few konoha examples based on popular ways konoha.data.token.Token is used in public projects.
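Token is the small container class that every konoha tokenizer returns. Before looking at the project code below, here is a minimal sketch of constructing one by hand; it assumes only that the keyword arguments used throughout the snippets (surface, postag, pron, and so on) are readable back as attributes of the same names:

from konoha.data.token import Token

token = Token(surface="猫", postag="名詞", pron="ネコ")
print(token.surface, token.postag, token.pron)  # attribute names assumed to mirror the keyword arguments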


github himkt / tiny_tokenizer / konoha / word_tokenizers / janome_tokenizer.py
def tokenize(self, text: str) -> List[Token]:
    return_result = []
    parse_result = self._tokenizer.tokenize(text)

    if self._with_postag:
        # Janome morphemes already carry POS and inflection info;
        # copy each field onto a konoha Token.
        for morph in parse_result:
            surface = morph.surface
            postag, postag2, postag3, postag4 = morph.part_of_speech.split(",")
            inflection = morph.infl_type
            conjugation = morph.infl_form
            base_form = morph.base_form
            yomi = morph.reading
            pron = morph.phonetic

            token = Token(
                surface=surface,
                postag=postag,
                postag2=postag2,
                postag3=postag3,
                postag4=postag4,
                inflection=inflection,
                conjugation=conjugation,
                base_form=base_form,
                yomi=yomi,
                pron=pron,
            )
            return_result.append(token)

    else:
        # Without POS tagging, only the surface form is kept.
        for morph in parse_result:
            return_result.append(Token(surface=morph.surface))

    return return_result
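The morph objects above are plain Janome tokens: surface, part_of_speech, infl_type, infl_form, base_form, reading and phonetic are Janome attributes, not konoha ones. A minimal sketch of the underlying call, assuming only that the janome package is installed:

from janome.tokenizer import Tokenizer

janome = Tokenizer()  # plays the role of self._tokenizer in the snippet above
for morph in janome.tokenize("すもももももももものうち"):
    print(morph.surface, morph.part_of_speech, morph.base_form)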
github himkt / tiny_tokenizer / konoha / word_tokenizers / kytea_tokenizer.py
def tokenize(self, text: str) -> List[Token]:
    tokens = []

    if self._with_postag:
        response = self._tokenizer.getTagsToString(text)

        # FIXME Following dirty workaround is required to
        #       process inputs which include whitespace itself
        #       (e.g. "私 は猫"): escaped spaces are swapped for a
        #       <SPACE> placeholder so the response can safely be
        #       split on " " below.
        response = response.replace("\\ ", "<SPACE>").replace("  ", " <SPACE>")

        for elem in response.split(" ")[:-1]:
            # FIXME If input contains a character "/",
            #       KyTea outputs "//補助記号/・",
            #       which breaks the simple logic elem.split("/")
            pron, postag, surface = map(
                lambda e: e[::-1], elem[::-1].split("/", maxsplit=2)
            )
            surface = surface.replace("<SPACE>", " ")
            tokens.append(Token(surface=surface, postag=postag, pron=pron))

    else:
        for surface in list(self._tokenizer.getWS(text)):
            tokens.append(Token(surface=surface))

    return tokens
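getTagsToString and getWS are calls on the Mykytea binding for KyTea, not part of konoha itself. A rough sketch of creating such a handle directly; the model path is only a placeholder for wherever your trained KyTea model lives:

import Mykytea

kytea = Mykytea.Mykytea("-model /usr/local/share/kytea/model.bin")  # placeholder model path
for word in kytea.getWS("私は猫"):  # word segmentation only, no POS tags
    print(word)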
github himkt / tiny_tokenizer / konoha / word_tokenizers / mecab_tokenizer.py
if self._with_postag:
    for elem in parse_result.split("\n")[:-1]:
        # _parse_feature splits one line of MeCab output into the
        # surface form and its comma-separated feature fields.
        (
            surface,
            postag,
            postag2,
            postag3,
            postag4,
            inflection,
            conjugation,
            base_form,
            yomi,
            pron,
        ) = self._parse_feature(elem)

        token = Token(
            surface=surface,
            postag=postag,
            postag2=postag2,
            postag3=postag3,
            postag4=postag4,
            inflection=inflection,
            conjugation=conjugation,
            base_form=base_form,
            yomi=yomi,
            pron=pron,
        )
        return_result.append(token)
else:
    for surface in parse_result.split(" "):
        return_result.append(Token(surface=surface))

return return_result
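In the tagged branch above, parse_result is presumably the raw text a MeCab tagger prints: one morpheme per line, with the surface form, a tab, and comma-separated features. A minimal sketch of producing such output with mecab-python3 (a MeCab dictionary such as ipadic or unidic must be installed):

import MeCab

tagger = MeCab.Tagger()
print(tagger.parse("すもももももももものうち"))
# one "surface<TAB>features" line per morpheme, terminated by an "EOS" line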
github himkt / tiny_tokenizer / konoha / word_tokenizers / whitespace_tokenizer.py
def tokenize(self, text: str) -> List[Token]:
    return [Token(surface=surface) for surface in text.split(" ")]
github himkt / tiny_tokenizer / konoha / word_tokenizers / sudachi_tokenizer.py
def tokenize(self, text: str) -> List[Token]:
    result = []
    for token in self._tokenizer.tokenize(text, self._mode):
        surface = token.surface()
        if self._with_postag:
            # SudachiPy returns part of speech as a six-element tuple.
            (
                postag,
                postag2,
                postag3,
                postag4,
                inflection,
                conjugation,
            ) = token.part_of_speech()
            base_form = token.dictionary_form()
            normalized_form = token.normalized_form()
            yomi = token.reading_form()
            result.append(
                Token(
                    surface=surface,
                    postag=postag,
                    postag2=postag2,
                    postag3=postag3,
                    postag4=postag4,
                    inflection=inflection,
                    conjugation=conjugation,
                    base_form=base_form,
                    normalized_form=normalized_form,
                    yomi=yomi,
                )
            )
        else:
            result.append(Token(surface=surface))
    return result
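self._tokenizer and self._mode here are ordinary SudachiPy objects; how konoha constructs them is not shown in this snippet. A minimal standalone setup, assuming sudachipy and a dictionary package such as sudachidict_core are installed, looks roughly like this:

from sudachipy import dictionary, tokenizer

sudachi = dictionary.Dictionary().create()  # stands in for self._tokenizer
mode = tokenizer.Tokenizer.SplitMode.A      # stands in for self._mode
for morpheme in sudachi.tokenize("自然言語処理を勉強しています", mode):
    print(morpheme.surface(), morpheme.part_of_speech())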
github himkt / tiny_tokenizer / konoha / word_tokenizers / sentencepiece_tokenizer.py
def tokenize(self, text: str) -> List[Token]:
    result = []
    for subword in self._tokenizer.EncodeAsPieces(text):
        token = Token(surface=subword)
        result.append(token)
    return result
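In practice you rarely construct these per-backend tokenizer classes yourself: konoha exposes them through a WordTokenizer front end, and each tokenize() call returns the konoha.data.token.Token objects shown above. A short, hedged sketch of that entry point; constructor arguments can vary between konoha versions, and every backend needs its own dependencies installed:

from konoha import WordTokenizer

tokenizer = WordTokenizer("MeCab")  # other backends: "Janome", "KyTea", "Sudachi", "Sentencepiece", "Whitespace"
tokens = tokenizer.tokenize("自然言語処理を勉強しています")
print([token.surface for token in tokens])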