from typing import List
from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer
class WhitespaceTokenizer(BaseTokenizer):
    """Simple rule-based word tokenizer."""

    def __init__(self) -> None:
        super(WhitespaceTokenizer, self).__init__("whitespace")

    def tokenize(self, text: str) -> List[Token]:
        return [Token(surface=surface) for surface in text.split(" ")]
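if __name__ == "__main__":
    # Usage sketch (assumes konoha is installed): this tokenizer splits on
    # single spaces only, so it suits pre-segmented or English-like input.
    tokenizer = WhitespaceTokenizer()
    print([t.surface for t in tokenizer.tokenize("i am a cat")])  # ['i', 'am', 'a', 'cat']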
from typing import List
from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer
class SentencepieceTokenizer(BaseTokenizer):
    """Wrapper class for Sentencepiece."""

    def __init__(self, model_path: str, **kwargs) -> None:
        """
        Initializer for SentencepieceTokenizer.

        Parameters
        ---
        model_path (str)
            path to sentencepiece model.
        **kwargs
            others.
        """
        try:
            import sentencepiece
        except ImportError:
            msg = "Importing sentencepiece failed for some reason."
            msg += "\n 1. make sure sentencepiece is successfully installed."
            raise ImportError(msg)
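        # Completion sketch (assumption): the snippet is truncated here. The
        # super() arguments and attribute name are guesses; SentencePieceProcessor
        # and its load/encode_as_pieces methods are sentencepiece's public API.
        super().__init__(name="sentencepiece")
        self._tokenizer = sentencepiece.SentencePieceProcessor()
        self._tokenizer.load(model_path)

    def tokenize(self, text: str) -> List[Token]:
        # Sketch: encode_as_pieces performs the subword split.
        return [Token(surface=piece) for piece in self._tokenizer.encode_as_pieces(text)]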
from typing import Optional
from typing import Tuple

def parse_feature_for_ipadic(elem: str) -> Tuple[Optional[str], ...]:
    # Hedged reconstruction: the original snippet begins mid-function, at the
    # return statement below. This helper splits one MeCab/IPADIC output line
    # ("surface\tfeature1,feature2,...") into its surface and feature fields.
    surface, feature = elem.split("\t")
    (postag, postag2, postag3, postag4,
     inflection, conjugation, base_form, *other) = feature.split(",")
    # Known words carry reading (yomi) and pronunciation as trailing fields.
    yomi, pron = other if len(other) == 2 else (None, None)
    return (
        surface,
        postag,
        postag2,
        postag3,
        postag4,
        inflection,
        conjugation,
        base_form,
        yomi,
        pron,
    )
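# Example of the mapping (hypothetical input, reconstructed helper above):
# parse_feature_for_ipadic("すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ")
# -> ("すもも", "名詞", "一般", "*", "*", "*", "*", "すもも", "スモモ", "スモモ")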
class MeCabTokenizer(BaseTokenizer):
    """Wrapper class for external text analyzers."""

    def __init__(
        self,
        user_dictionary_path: Optional[str] = None,
        system_dictionary_path: Optional[str] = None,
        dictionary_format: Optional[str] = None,
        with_postag: bool = False,
    ) -> None:
        """
        Initializer for MeCabTokenizer.

        Parameters
        ---
        user_dictionary_path (Optional[str]=None)
            path to a custom user dictionary. (option)
        system_dictionary_path (Optional[str]=None)
            path to a system dictionary. (option)
        dictionary_format (Optional[str]=None)
            format of the dictionary, e.g. ipadic. (option)
        with_postag (bool=False)
            flag determines if tokenizer includes pos tags. (option)
        """
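# Usage sketch (assumes a local MeCab installation; the original snippet is
# truncated before the constructor body and a tokenize() method are defined):
# tokenizer = MeCabTokenizer(with_postag=True)
# tokens = tokenizer.tokenize("すもももももももものうち")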
from typing import List
from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer
class SudachiTokenizer(BaseTokenizer):
    """Wrapper class for SudachiPy."""

    def __init__(self, mode: str, with_postag: bool, **kwargs) -> None:
        """
        Initializer for SudachiTokenizer.

        Parameters
        ---
        mode (str)
            Splitting mode which controls the granularity of tokens.
            (mode should be `A`, `B` or `C`)
            For more information, see the following links.
            - document: https://github.com/WorksApplications/Sudachi#the-modes-of-splitting  # NOQA
            - paper: http://www.lrec-conf.org/proceedings/lrec2018/summaries/8884.html  # NOQA
        with_postag (bool=False)
            flag determines if tokenizer includes pos tags.
        """
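        # Completion sketch (assumption): the snippet is truncated here. The
        # import guard mirrors the other wrappers in this repo; Dictionary and
        # SplitMode come from SudachiPy's public API, the rest is guesswork.
        try:
            from sudachipy import dictionary
            from sudachipy import tokenizer
        except ImportError:
            msg = "Importing sudachipy failed for some reason."
            msg += "\n 1. make sure SudachiPy is successfully installed."
            raise ImportError(msg)
        super().__init__(name="sudachi", with_postag=with_postag)
        self._tokenizer = dictionary.Dictionary(**kwargs).create()
        self._mode = getattr(tokenizer.Tokenizer.SplitMode, mode)  # `A`, `B` or `C`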
from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer
class CharacterTokenizer(BaseTokenizer):
    """Character tokenizer."""

    def __init__(self):
        super(CharacterTokenizer, self).__init__("character")

    def tokenize(self, text: str):
        return [Token(surface=char) for char in list(text)]
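if __name__ == "__main__":
    # Usage sketch: character-level splitting needs no external dependency.
    tokenizer = CharacterTokenizer()
    print([t.surface for t in tokenizer.tokenize("猫だ")])  # ['猫', 'だ']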
from typing import List
from typing import Optional
from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer
class KyTeaTokenizer(BaseTokenizer):
    """Wrapper class for KyTea."""

    def __init__(
        self, with_postag: bool = False, model_path: Optional[str] = None, **kwargs
    ) -> None:
        super(KyTeaTokenizer, self).__init__(
            name="kytea", with_postag=with_postag, model_path=model_path
        )
        try:
            import Mykytea
        except ImportError:
            msg = "Importing kytea failed for some reason."
            msg += "\n 1. make sure KyTea is successfully installed."
            msg += "\n 2. make sure Mykytea-python is successfully installed."
            raise ImportError(msg)
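        # Completion sketch (assumption): the snippet is truncated here.
        # Mykytea takes a single option string; "-model <path>" selects a model.
        flag = "" if model_path is None else "-model {}".format(model_path)
        self._tokenizer = Mykytea.Mykytea(flag)

    def tokenize(self, text: str) -> List[Token]:
        # Sketch: getWS yields the word surfaces for the input sentence.
        return [Token(surface=surface) for surface in self._tokenizer.getWS(text)]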
from typing import List
from typing import Optional
from konoha.data.token import Token
from konoha.word_tokenizers.tokenizer import BaseTokenizer
class JanomeTokenizer(BaseTokenizer):
    """Wrapper class for Janome."""

    def __init__(
        self, user_dictionary_path: Optional[str] = None, with_postag: bool = False
    ) -> None:
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            msg = "Importing janome failed for some reason."
            msg += "\n 1. make sure janome is successfully installed."
            raise ImportError(msg)
        super().__init__(name="janome", with_postag=with_postag)
        self._tokenizer = Tokenizer(udic=user_dictionary_path)
    def tokenize(self, text: str) -> List[Token]:
        # Completion sketch (assumption): the original snippet ends at this
        # signature. Janome tokens expose `surface`; POS handling for
        # with_postag=True is omitted here.
        return [Token(surface=morph.surface) for morph in self._tokenizer.tokenize(text)]
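if __name__ == "__main__":
    # Usage sketch: janome is pure Python, so no external binary is required.
    tokenizer = JanomeTokenizer()
    print([t.surface for t in tokenizer.tokenize("吾輩は猫である")])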