How to use the konoha.word_tokenizers.tokenizer.BaseTokenizer class in konoha

To help you get started, we've selected a few konoha examples, based on popular ways it is used in public projects.
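As the examples below show, BaseTokenizer is used by subclassing: a concrete tokenizer passes its name to super().__init__() and implements tokenize() to return a list of konoha Token objects. A minimal sketch of that pattern (the UpperCaseTokenizer class is illustrative and not part of konoha):

from typing import List

from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer


class UpperCaseTokenizer(BaseTokenizer):
    """Illustrative tokenizer: splits on whitespace and upper-cases each surface."""

    def __init__(self) -> None:
        super().__init__("uppercase")  # register the tokenizer's name with BaseTokenizer

    def tokenize(self, text: str) -> List[Token]:
        return [Token(surface=surface.upper()) for surface in text.split()]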


github himkt / tiny_tokenizer / konoha / word_tokenizers / whitespace_tokenizer.py
from typing import List

from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer


class WhitespaceTokenizer(BaseTokenizer):
    """Simple rule-based word tokenizer."""

    def __init__(self) -> None:
        super(WhitespaceTokenizer, self).__init__("whitespace")

    def tokenize(self, text: str) -> List[Token]:
        return [Token(surface=surface) for surface in text.split(" ")]
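Using the tokenizer above then takes only a few lines; a quick sketch (the sample sentence is arbitrary, and it assumes Token exposes the surface attribute it is constructed with):

from konoha.word_tokenizers.whitespace_tokenizer import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
tokens = tokenizer.tokenize("I have a pen")
print([token.surface for token in tokens])  # expected: ['I', 'have', 'a', 'pen']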
github himkt / tiny_tokenizer / konoha / word_tokenizers / sentencepiece_tokenizer.py
from typing import List

from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer


class SentencepieceTokenizer(BaseTokenizer):
    """Wrapper class forSentencepiece"""

    def __init__(self, model_path: str, **kwargs) -> None:
        """
        Initializer for SentencepieceTokenizer.

        Parameters
        ---
        model_path (str)
            path to sentencepiece model.
        **kwargs
            others.
        """

        try:
            import sentencepiece
        # the excerpt is cut off here; the handler below mirrors the ImportError
        # pattern used by the KyTea and Janome tokenizers shown later on this page
        except ImportError:
            msg = "importing sentencepiece failed for some reason."
            msg += "\n  1. make sure sentencepiece is successfully installed."
            raise ImportError(msg)
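A usage sketch for the class above; the model path is a placeholder, and the call assumes tokenize() follows the same List[Token] interface as the other tokenizers on this page (the excerpt is cut off before that method):

from konoha.word_tokenizers.sentencepiece_tokenizer import SentencepieceTokenizer

# requires the sentencepiece package and a trained model file (path is illustrative)
tokenizer = SentencepieceTokenizer(model_path="./data/model.spm")
tokens = tokenizer.tokenize("natural language processing")
print([token.surface for token in tokens])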
github himkt / tiny_tokenizer / konoha / word_tokenizers / mecab_tokenizer.py
return (
        surface,
        postag,
        postag2,
        postag3,
        postag4,
        inflection,
        conjugation,
        base_form,
        yomi,
        pron,
    )


class MeCabTokenizer(BaseTokenizer):
    """Wrapper class forexternal text analyzers"""

    def __init__(
        self,
        user_dictionary_path: Optional[str] = None,
        system_dictionary_path: Optional[str] = None,
        dictionary_format: Optional[str] = None,
        with_postag: bool = False,
    ) -> None:
        """
        Initializer for MeCabTokenizer.

        Parameters
        ---
        user_dictionary_path (Optional[str]=None)
            path to a custom user dictionary (optional)
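A sketch of constructing the wrapper above; it assumes MeCab and a Python binding are installed, and that each returned Token carries the postag field produced by the helper shown at the top of the excerpt:

from konoha.word_tokenizers.mecab_tokenizer import MeCabTokenizer

# with_postag=True asks the wrapper to attach part-of-speech information to each Token
tokenizer = MeCabTokenizer(with_postag=True)
tokens = tokenizer.tokenize("自然言語処理を勉強しています")
print([(token.surface, token.postag) for token in tokens])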
github himkt / tiny_tokenizer / konoha / word_tokenizers / sudachi_tokenizer.py
from typing import List

from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer


class SudachiTokenizer(BaseTokenizer):
    """Wrapper class for SudachiPy."""

    def __init__(self, mode: str, with_postag: bool, **kwargs) -> None:
        """
        Initializer for SudachiTokenizer.

        Parameters
        ---
        mode (str)
            Splitting mode, which controls the granularity of tokens.
            (mode should be `A`, `B` or `C`)
            For more information, see following links.
            - document: https://github.com/WorksApplications/Sudachi#the-modes-of-splitting  # NOQA
            - paper: http://www.lrec-conf.org/proceedings/lrec2018/summaries/8884.html  # NOQA
        with_postag (bool=False)
            flag that determines whether the tokenizer includes POS tags.
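A sketch for the Sudachi wrapper, using mode "A" from the docstring above; it assumes SudachiPy and its dictionary are installed:

from konoha.word_tokenizers.sudachi_tokenizer import SudachiTokenizer

# "A" is the finest splitting mode; "B" and "C" produce progressively longer units
tokenizer = SudachiTokenizer(mode="A", with_postag=False)
tokens = tokenizer.tokenize("自然言語処理")
print([token.surface for token in tokens])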
github himkt / tiny_tokenizer / konoha / word_tokenizers / character_tokenizer.py
from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer


class CharacterTokenizer(BaseTokenizer):
    """Characterkonoha.tokenizer"""

    def __init__(self):
        super(CharacterTokenizer, self).__init__("character")

    def tokenize(self, text: str):
        return [Token(surface=char) for char in list(text)]
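Usage mirrors the whitespace tokenizer, except every character becomes its own Token:

from konoha.word_tokenizers.character_tokenizer import CharacterTokenizer

tokenizer = CharacterTokenizer()
tokens = tokenizer.tokenize("abc")
print([token.surface for token in tokens])  # expected: ['a', 'b', 'c']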
github himkt / tiny_tokenizer / konoha / word_tokenizers / kytea_tokenizer.py
from typing import List
from typing import Optional

from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer


class KyTeaTokenizer(BaseTokenizer):
    """Wrapper class forKyTea"""

    def __init__(
        self, with_postag: bool = False, model_path: Optional[str] = None, **kwargs
    ) -> None:

        super(KyTeaTokenizer, self).__init__(
            name="kytea", with_postag=with_postag, model_path=model_path
        )
        try:
            import Mykytea
        except ImportError:
            msg = "Importing kytea failed for some reason."
            msg += "\n  1. make sure KyTea is successfully installed."
            msg += "\n  2. make sure Mykytea-python is successfully installed."
            raise ImportError(msg)
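A sketch for the KyTea wrapper above; it assumes KyTea and Mykytea-python are installed (otherwise the ImportError above is raised) and that tokenize() returns List[Token] like the other wrappers:

from konoha.word_tokenizers.kytea_tokenizer import KyTeaTokenizer

# model_path is optional per the signature above; omitting it relies on KyTea's default model
tokenizer = KyTeaTokenizer(with_postag=False)
tokens = tokenizer.tokenize("自然言語処理")
print([token.surface for token in tokens])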
github himkt / tiny_tokenizer / konoha / word_tokenizers / janome_tokenizer.py
from typing import List
from typing import Optional

from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer


class JanomeTokenizer(BaseTokenizer):
    """Wrapper class for Janome."""

    def __init__(
        self, user_dictionary_path: Optional[str] = None, with_postag: bool = False
    ) -> None:
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            msg = "Importing janome failed for some reason."
            msg += "\n  1. make sure janome is successfully installed."
            raise ImportError(msg)

        super().__init__(name="janome", with_postag=with_postag)
        self._tokenizer = Tokenizer(udic=user_dictionary_path)

    def tokenize(self, text: str) -> List[Token]: