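# NOTE (assumption): these snippets are excerpted from the konoha test suite
# and demo script; the imports and sentence fixtures below are reconstructed
# here so the examples are self-contained. The Token import path and the exact
# fixture values may differ in the actual repository, and the functions that
# take self belong to a unittest.TestCase-style class in the original suite.
import pytest

from konoha import WordTokenizer
from konoha.data.token import Token

SENTENCE1 = "吾輩は猫である"  # MeCab splits this into 吾輩 / は / 猫 / で / ある
SENTENCE2 = "医薬品安全管理責任者"  # Sudachi modes A and B split this differently
SENTENCE3 = "吾輩 は 猫 で ある"  # already whitespace-delimited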


def test_word_tokenize_with_whitespace(self):
    """Test Whitespace tokenizer."""
    tokenizer1 = WordTokenizer(tokenizer="Whitespace")
    tokenizer2 = WordTokenizer(tokenizer="whitespace")  # tokenizer name lookup is case-insensitive
    # assert tokenizer1 == tokenizer2
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]  # NOQA
    result1 = tokenizer1.tokenize(SENTENCE3)
    result2 = tokenizer2.tokenize(SENTENCE3)
    assert expect == result1  # NOQA
    assert result1 == result2


def test_word_tokenize_with_mecab_whitespace():
    try:
        import natto  # probe that the optional natto-py backend is importable
        del natto
    except ImportError:
        pytest.skip("natto-py is not installed.")
    tokenizer = WordTokenizer(tokenizer="MeCab")
    expect = [Token(surface=w) for w in "吾輩 は で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は である")
    assert expect == result


def test_kytea_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("skip s3 test because of missing boto3")
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("KyTea is not installed.")
    try:
        tokenizer = WordTokenizer(
            tokenizer="KyTea", model_path="s3://konoha-demo/kytea/model.knm"
        )
    except ImportError:
        pytest.skip("KyTea is not installed.")
    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
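
# NOTE (assumption): fetching the s3:// model path goes through boto3 (hence
# the probe above), which reads credentials from the standard AWS sources
# (environment variables, ~/.aws/credentials, or an instance profile).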


def test_word_tokenize_with_kytea_using_custom_model():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("KyTea is not installed.")
    tokenizer = WordTokenizer(tokenizer="KyTea", model_path="data/model.knm")
    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result


def test_word_tokenize_with_sentencepiece():
    try:
        import sentencepiece
        del sentencepiece
    except ImportError:
        pytest.skip("Sentencepiece is not installed.")
    tokenizer = WordTokenizer(tokenizer="Sentencepiece", model_path="data/model.spm")
    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
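
# NOTE (assumption): the test above expects a pre-trained model at
# data/model.spm. A minimal sketch of producing such a model with the
# sentencepiece trainer; the corpus path and vocab size are placeholders,
# and older sentencepiece releases take a single "--flag=value" argument
# string instead of keyword arguments:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="corpus.txt",  # hypothetical raw-text training corpus
    model_prefix="model",  # writes model.model and model.vocab
    vocab_size=8000,
)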


def test_word_tokenize_with_mecab():
    try:
        import natto
        del natto
    except ImportError:
        pytest.skip("natto-py is not installed.")
    tokenizer = WordTokenizer(tokenizer="MeCab")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result


def test_word_tokenize_with_mecab(self):
    """Test MeCab tokenizer."""
    try:
        tokenizer1 = WordTokenizer(tokenizer="MeCab")
        tokenizer2 = WordTokenizer(tokenizer="mecab")
    except ImportError:
        pytest.skip("skip mecab")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]  # NOQA
    result1 = tokenizer1.tokenize(SENTENCE1)
    result2 = tokenizer2.tokenize(SENTENCE1)
    assert expect == result1  # NOQA
    assert result1 == result2


def test_word_tokenize_with_sudachi_mode_b(self):
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="Sudachi", mode="B")
    except ImportError:
        pytest.skip("skip sudachi")
    expect = [Token(surface=w) for w in "医薬品 安全 管理 責任者".split(" ")]
    result = tokenizer.tokenize(SENTENCE2)
    self.assertEqual(expect, result)


def test_word_tokenize_with_sudachi_mode_a(self):
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="Sudachi", mode="A")
    except ImportError:
        pytest.skip("skip sudachi")
    expect = [Token(surface=w) for w in "医薬 品 安全 管理 責任 者".split(" ")]
    result = tokenizer.tokenize(SENTENCE2)
    self.assertEqual(expect, result)
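
# Sudachi's split mode controls segmentation granularity: mode A produces the
# shortest units (医薬 / 品 / ...), mode B intermediate ones (医薬品 / ...), and
# mode C the longest; the two tests above exercise A and B on the same sentence.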
tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
print(tokenizer.tokenize("我輩は猫である"))
tokenizer = WordTokenizer("kytea", with_postag=False)
print(tokenizer.tokenize("我輩は猫である"))
tokenizer = WordTokenizer("kytea", with_postag=True)
print(tokenizer.tokenize("我輩は猫である"))
tokenizer = WordTokenizer("janome", with_postag=True)
print(tokenizer.tokenize("我輩は猫である"))
tokenizer = WordTokenizer("sentencepiece", model_path="./data/model.spm")
print(tokenizer.tokenize("我輩は猫である"))
tokenizer = WordTokenizer("sudachi", mode="A")
print(tokenizer.tokenize("我輩は猫である"))
tokenizer = WordTokenizer("character")
print(tokenizer.tokenize("我輩は猫である"))
tokenizer = WordTokenizer("whitespace")
print(tokenizer.tokenize("我輩 は 猫 で ある"))