How to use the wordfreq.simple_tokenize function in wordfreq

To help you get started, we’ve selected a few examples of wordfreq.simple_tokenize, based on popular ways it is used in public projects.

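For a quick orientation before the project examples below, here is a minimal usage sketch. simple_tokenize splits text on Unicode word boundaries without consulting any language-specific dictionary; the Japanese call is taken from the tests below, and the exact output casing may vary slightly between wordfreq versions.

from wordfreq import simple_tokenize

# Punctuation is dropped by default (include_punctuation=False)
print(simple_tokenize("toner cartridge, please"))
# ['toner', 'cartridge', 'please']

# Japanese scripts are kept together; the explicit '・' separator is still
# a token boundary
print(simple_tokenize("トナー・カートリッジ"))
# ['トナー', 'カートリッジ']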

github LuminosoInsight / wordfreq / tests / test_japanese.py (View on GitHub)
from wordfreq import simple_tokenize


def test_simple_tokenize():
    # When Japanese is run through simple_tokenize -- either because it's
    # tagged with the wrong language, or because we want to pass through
    # Japanese text without getting MeCab involved -- it will be split at
    # boundaries between Japanese and non-Japanese scripts, but all Japanese
    # scripts will be stuck together. Here the switch between hiragana
    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
    # between katakana and romaji is.
    #
    # We used to try to infer word boundaries between hiragana and katakana,
    # but this leads to edge cases that are unsolvable without a dictionary.
    ja_text = 'ひらがなカタカナromaji'
    assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']

    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
    # but sticks together in simple_tokenize
    assert simple_tokenize('おはようございます') == ['おはようございます']

    # Names that use the weird possessive marker ヶ, which is technically a
    # katakana even though it's being used like a kanji, stay together as one
    # token
    assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]

    # The word in ConceptNet that made me notice that simple_tokenize used
    # to have a problem with the character 々
    assert simple_tokenize("晴々しい") == ["晴々しい"]

    # Explicit word separators are still token boundaries, such as the dot
    # between "toner" and "cartridge" in "toner cartridge"
    assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]

    # This word has multiple weird characters that aren't quite kanji in it,
    # and is in the dictionary
    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
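The comments above contrast simple_tokenize with language-aware tokenization. The sketch below shows that difference on the greeting from the test; calling tokenize with lang='ja' assumes wordfreq's optional MeCab-based Japanese support is installed, and its exact segmentation depends on the dictionary, so no expected output is shown for it.

from wordfreq import simple_tokenize, tokenize

text = 'おはようございます'

# simple_tokenize never consults MeCab, so Japanese scripts stay together
print(simple_tokenize(text))   # ['おはようございます']

# tokenize(text, 'ja') hands the text to MeCab, which segments it into
# multiple tokens (requires wordfreq's optional Japanese dependencies)
print(tokenize(text, 'ja'))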
github LuminosoInsight / wordfreq / wordfreq_builder / wordfreq_builder / word_counts.py (View on GitHub)
If `cutoff` is greater than 0 or `max_words` is smaller than the list,
    the csv file must be sorted by value in descending order, so that the
    most frequent words are kept.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_words:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
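The lang branch is the pattern to take away from this snippet: language-specific tokenization when the language is known, simple_tokenize otherwise. Below is a condensed, hypothetical sketch of that dispatch; the count_tokens helper and the counts.csv file name are illustrative and not part of wordfreq.

import csv
from collections import defaultdict

from wordfreq import simple_tokenize, tokenize


def count_tokens(filename, lang=None):
    # Hypothetical helper mirroring the snippet above: each CSV row is
    # (word, value), and values are accumulated per token.
    values = defaultdict(float)
    with open(filename, encoding='utf-8', newline='') as infile:
        for word, strval in csv.reader(infile):
            val = float(strval)
            tokens = tokenize(word, lang) if lang is not None else simple_tokenize(word)
            for token in tokens:
                values[token] += val
    return values


print(count_tokens('counts.csv'))         # script-boundary tokenization only
print(count_tokens('counts.csv', 'ja'))   # language-specific tokenization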
github LuminosoInsight / python-ftfy / scripts / mojibakery.py (View on GitHub)
def find_mojibake(normal_freqs, baked_freqs):
    mojibake_items = []
    for (trigram, encoder, decoder), freq in baked_freqs.items():
        if trigram not in normal_freqs and trigram.lower() not in normal_freqs and not exclude_trigram(trigram):
            tokenized = ' '.join(wordfreq.simple_tokenize(trigram))
            if len(tokenized) == len(trigram):
                mojibake_items.append((int(freq * 1e6), trigram, encoder, decoder))
    mojibake_items.sort(reverse=True)
    return mojibake_items[:50000]
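The length comparison above is doing real work: a trigram is kept only if it survives simple_tokenize unchanged, since any character dropped by tokenization (such as punctuation) makes the space-joined tokens differ in length from the original. A small illustration of that check, assuming the default include_punctuation=False behavior:

from wordfreq import simple_tokenize


def survives_tokenization(text):
    # True when the space-joined tokens are the same length as the input,
    # i.e. tokenization did not drop or reshape anything.
    return len(' '.join(simple_tokenize(text))) == len(text)


print(survives_tokenization('abc'))   # True: a single clean token
print(survives_tokenization('ab.'))   # False: the '.' is dropped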