How to use the konoha.word_tokenizer.WordTokenizer class in konoha

To help you get started, we’ve selected a few konoha examples, based on popular ways it is used in public projects.
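
Before the project snippets below, here is a minimal quick-start sketch. It assumes konoha is installed (e.g. via pip) and uses the whitespace backend, which needs no external tokenizer; the other backends shown later (MeCab, KyTea, Janome, Sudachi, Sentencepiece) require their own optional dependencies. The import path follows the module named in the title.

# Minimal quick-start sketch (assumes a plain `pip install konoha`).
# The "whitespace" backend simply splits on spaces, so the input is
# already-segmented text; tokenize() returns a list of Token objects.
from konoha.word_tokenizer import WordTokenizer

tokenizer = WordTokenizer(tokenizer="whitespace")
tokens = tokenizer.tokenize("吾輩 は 猫 で ある")
print(tokens)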

github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py
def test_word_tokenize_with_whitespace(self):
        """Test Character tokenizer."""
        tokenizer1 = WordTokenizer(tokenizer="Whitespace")
        tokenizer2 = WordTokenizer(tokenizer="whitespace")
        # assert tokenizer1 == tokenizer2
        expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]  # NOQA
        result1 = tokenizer1.tokenize(SENTENCE3)
        result2 = tokenizer2.tokenize(SENTENCE3)
        assert expect == result1  # NOQA
        assert result1 == result2
github himkt / tiny_tokenizer / tests / word_tokenizer / test_mecab_tokenizer.py
def test_word_tokenize_with_mecab_whitespace():
    try:
        import natto

        del natto
    except ImportError:
        pytest.skip("natto-py is not installed.")

    tokenizer = WordTokenizer(tokenizer="MeCab")
    expect = [Token(surface=w) for w in "吾輩 は 　 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は　である")
    assert expect == result
github himkt / tiny_tokenizer / tests / word_tokenizer / test_kytea_tokenizer.py
def test_kytea_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("skip s3 test because of missing boto3")

    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("KyTea is not installed.")

    try:
        tokenizer = WordTokenizer(
            tokenizer="KyTea", model_path="s3://konoha-demo/kytea/model.knm"
        )
    except ImportError:
        pytest.skip("KyTea is not installed.")

    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
github himkt / tiny_tokenizer / tests / word_tokenizer / test_kytea_tokenizer.py
def test_word_tokenize_with_kytea_using_custom_model():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("KyTea is not installed.")

    tokenizer = WordTokenizer(tokenizer="KyTea", model_path="data/model.knm")
    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
github himkt / tiny_tokenizer / tests / word_tokenizer / test_sentencepiece_tokenizer.py
def test_word_tokenize_with_sentencepiece():
    try:
        import sentencepiece
        del sentencepiece
    except ImportError:
        pytest.skip("Sentencepiece is not installed.")

    tokenizer = WordTokenizer(tokenizer="Sentencepiece", model_path="data/model.spm")
    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
github himkt / tiny_tokenizer / tests / word_tokenizer / test_mecab_tokenizer.py
def test_word_tokenize_with_mecab():
    try:
        import natto

        del natto
    except ImportError:
        pytest.skip("natto-py is not installed.")

    tokenizer = WordTokenizer(tokenizer="MeCab")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py
def test_word_tokenize_with_mecab(self):
        """Test MeCab tokenizer."""
        try:
            tokenizer1 = WordTokenizer(tokenizer="MeCab")
            tokenizer2 = WordTokenizer(tokenizer="mecab")
        except ImportError:
            pytest.skip("skip mecab")

        expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]  # NOQA
        result1 = tokenizer1.tokenize(SENTENCE1)
        result2 = tokenizer2.tokenize(SENTENCE1)
        assert expect == result1  # NOQA
        assert result1 == result2
github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py
def test_word_tokenize_with_sudachi_mode_b(self):
        """Test Sudachi tokenizer."""
        try:
            tokenizer = WordTokenizer(tokenizer="Sudachi", mode="B")
        except ImportError:
            pytest.skip("skip sudachi")

        expect = [Token(surface=w) for w in "医薬品 安全 管理 責任者".split(" ")]
        result = tokenizer.tokenize(SENTENCE2)
        self.assertEqual(expect, result)
github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py
def test_word_tokenize_with_sudachi_mode_a(self):
        """Test Sudachi tokenizer."""
        try:
            tokenizer = WordTokenizer(tokenizer="Sudachi", mode="A")
        except ImportError:
            pytest.skip("skip sudachi")

        expect = [Token(surface=w) for w in "医薬 品 安全 管理 責任 者".split(" ")]
        result = tokenizer.tokenize(SENTENCE2)
        self.assertEqual(expect, result)
github himkt / tiny_tokenizer / konoha / word_tokenizer.py
tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    print(tokenizer.tokenize("我輩は猫である"))

    tokenizer = WordTokenizer("kytea", with_postag=False)
    print(tokenizer.tokenize("我輩は猫である"))

    tokenizer = WordTokenizer("kytea", with_postag=True)
    print(tokenizer.tokenize("我輩は猫である"))

    tokenizer = WordTokenizer("janome", with_postag=True)
    print(tokenizer.tokenize("我輩は猫である"))

    tokenizer = WordTokenizer("sentencepiece", model_path="./data/model.spm")
    print(tokenizer.tokenize("我輩は猫である"))

    tokenizer = WordTokenizer("sudachi", mode="A")
    print(tokenizer.tokenize("我輩は猫である"))

    tokenizer = WordTokenizer("character")
    print(tokenizer.tokenize("我輩は猫である"))

    tokenizer = WordTokenizer("whitespace")
    print(tokenizer.tokenize("我輩 は 猫 で ある"))