from wordfreq import simple_tokenize


def test_simple_tokenize():
    # When Japanese is run through simple_tokenize -- either because it's
    # tagged with the wrong language, or because we want to pass through
    # Japanese text without getting MeCab involved -- it will be split at
    # boundaries between Japanese and non-Japanese scripts, but all Japanese
    # scripts will be stuck together. Here the switch between hiragana
    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
    # between katakana and romaji is.
    #
    # We used to try to infer word boundaries between hiragana and katakana,
    # but this leads to edge cases that are unsolvable without a dictionary.
    ja_text = 'ひらがなカタカナromaji'
    assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']

    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
    # but sticks together in simple_tokenize (see the sketch after this test)
    assert simple_tokenize('おはようございます') == ['おはようございます']

    # Names that use the weird possessive marker ヶ, which is technically a
    # katakana even though it's being used like a kanji, stay together as one
    # token
    assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]

    # The word in ConceptNet that made me notice that simple_tokenize used
    # to have a problem with the character 々
    assert simple_tokenize("晴々しい") == ["晴々しい"]

    # Explicit word separators are still token boundaries, such as the dot
    # between "toner" and "cartridge" in "toner cartridge"
    assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]

    # This word has multiple weird characters that aren't quite kanji in it,
    # and is in the dictionary
    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
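
# For contrast, a brief sketch of what the full tokenizer does with the same
# greeting when the language is given as 'ja'. That path goes through MeCab,
# so the exact segmentation depends on the installed dictionary; the output
# shown here is illustrative, not guaranteed:
#
#     from wordfreq import tokenize
#     tokenize('おはようございます', 'ja')   # e.g. ['おはよう', 'ござい', 'ます']
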
import csv
from collections import defaultdict

from ftfy import fix_text
from wordfreq import simple_tokenize, tokenize


def read_values(filename, cutoff=0, max_words=1e8, lang=None):
    """
    Read words and their values from a CSV file, returning a dictionary of
    values and the total of all values.

    If `cutoff` is greater than 0 or `max_words` is smaller than the list,
    the csv file must be sorted by value in descending order, so that the
    most frequent words are kept.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_words:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
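

# A minimal usage sketch for read_values. 'word_counts.csv' is a hypothetical
# file of word,value rows, sorted by value in descending order as the
# docstring requires when `cutoff` or `max_words` comes into play:
#
#     values, total = read_values('word_counts.csv', cutoff=5.0, lang='en')
#     frequencies = {word: val / total for word, val in values.items()}
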
import wordfreq


def find_mojibake(normal_freqs, baked_freqs):
    # Note: exclude_trigram is a helper defined elsewhere in this script.
    mojibake_items = []
    for (trigram, encoder, decoder), freq in baked_freqs.items():
        # Keep trigrams that never occur in normally-encoded text and aren't
        # explicitly excluded
        if (
            trigram not in normal_freqs
            and trigram.lower() not in normal_freqs
            and not exclude_trigram(trigram)
        ):
            # Only keep the trigram if simple_tokenize doesn't split it: the
            # space-joined tokens have the same length as the original only
            # when it came back as a single token
            tokenized = ' '.join(wordfreq.simple_tokenize(trigram))
            if len(tokenized) == len(trigram):
                mojibake_items.append((int(freq * 1e6), trigram, encoder, decoder))
    # Return the most frequent suspected mojibake trigrams
    mojibake_items.sort(reverse=True)
    return mojibake_items[:50000]
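

# A minimal usage sketch for find_mojibake, assuming normal_freqs maps
# character trigrams to frequencies and baked_freqs maps
# (trigram, encoder, decoder) keys to frequencies, as the loop above expects.
# The values and encoding pair below are made up for illustration:
#
#     normal_freqs = {'the': 0.012, 'ing': 0.008}
#     baked_freqs = {('Ã©t', 'latin-1', 'utf-8'): 2.5e-6}
#     suspicious = find_mojibake(normal_freqs, baked_freqs)
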