How to use the wordfreq.tokenize function in wordfreq

To help you get started, we've selected a few wordfreq.tokenize examples, based on popular ways it is used in public projects.
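
Before diving into the project examples, here is a minimal sketch of calling tokenize directly (assuming wordfreq is installed, for example with pip install wordfreq). The expected outputs mirror assertions from the test snippets below.

from wordfreq import tokenize

# Tokens are case-folded and punctuation is dropped by default.
print(tokenize("This isn't French", 'en'))
# ['this', "isn't", 'french']

# Pass include_punctuation=True to keep attached punctuation,
# such as the elided French article "l'".
print(tokenize("l'heure", 'fr', include_punctuation=True))
# ["l'", 'heure']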


From LuminosoInsight/wordfreq, tests/test_general.py (view on GitHub):
def test_arabic():
    # Remove tatweels
    assert tokenize('متــــــــعب', 'ar') == ['متعب']

    # Remove combining marks
    assert tokenize('حَرَكَات', 'ar') == ['حركات']

    # An Arabic ligature that is affected by NFKC normalization
    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']

From LuminosoInsight/wordfreq, tests/test_chinese.py (view on GitHub):
def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    assert tokenize('谢谢谢谢', 'zh-CN') == tokens

    # Over-long codes for Chinese
    assert tokenize('谢谢谢谢', 'chi') == tokens
    assert tokenize('谢谢谢谢', 'zho') == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens

From LuminosoInsight/wordfreq, tests/test_general.py (view on GitHub):
def test_casefolding():
    assert tokenize('WEISS', 'de') == ['weiss']
    assert tokenize('weiß', 'de') == ['weiss']
    assert tokenize('İstanbul', 'tr') == ['istanbul']
    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']

From LuminosoInsight/wordfreq, tests/test_general.py (view on GitHub):
def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']

    # Test Khmer, a script similar to Thai
    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']

    # Remove vowel points in Hebrew
    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']

    # Deal with commas, cedillas, and I's in Turkish
    assert tokenize('kișinin', 'tr') == ['kişinin']
    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']

    # Deal with cedillas that should be commas-below in Romanian
    assert tokenize('acelaşi', 'ro') == ['același']
    assert tokenize('ACELAŞI', 'ro') == ['același']

From LuminosoInsight/wordfreq, tests/test_transliteration.py (view on GitHub):
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'

From LuminosoInsight/wordfreq, tests/test_french_and_related.py (view on GitHub):
def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']

From LuminosoInsight/wordfreq, tests/test_transliteration.py (view on GitHub):
def test_alternate_codes():
    # Try language codes for Serbo-Croatian that have been split, and now
    # are canonically mapped to Serbian
    assert tokenize("культуры", 'sh') == ["kul'tury"]
    assert tokenize("культуры", 'hbs') == ["kul'tury"]

From kmicklas/sentence-pairs, sort.py (view on GitHub):
# Sort tab-separated sentence pairs so that pairs whose rarest word is
# most frequent in the target language come first.
import csv
import html
import sys

import wordfreq

if len(sys.argv) != 3:
    print('Usage: python3 sort.py target-lang pairs.csv')
    sys.exit(1)

targetLang = sys.argv[1]
pairsPath = sys.argv[2]

pairs = {}

with open(pairsPath, 'r', encoding='utf-8') as pairsFile:
    reader = csv.reader(pairsFile, delimiter='\t')
    for row in reader:
        # Tokenize the source sentence using the target language's rules.
        words = wordfreq.tokenize(html.unescape(row[0]), targetLang)
        if not words:
            continue

        # Zipf frequency of each token in the target language.
        freqs = [wordfreq.zipf_frequency(word, targetLang, wordlist='combined')
                 for word in words]

        minfreq = min(freqs)
        avgfreq = sum(freqs) / float(len(freqs))
        pairs[row[0]] = (minfreq, avgfreq, row[1])

pairList = list(pairs.items())
pairList.sort(reverse=True, key=lambda i: i[1])

for pair in pairList:
    sys.stdout.buffer.write((pair[0] + '\t' + pair[1][2] + '\n').encode('utf-8'))
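
As a usage note, zipf_frequency reports word frequency on a logarithmic Zipf scale (roughly 0 for words absent from the wordlist up to about 8 for the most common words), so sorting by the minimum value ranks each sentence pair by its rarest word. A quick sketch, assuming the default English wordlist is available:

import wordfreq

# Common words score high on the Zipf scale and rare words score low;
# exact values depend on the wordlist data shipped with wordfreq.
print(wordfreq.zipf_frequency('the', 'en'))            # roughly 7-8
print(wordfreq.zipf_frequency('tokenization', 'en'))   # several points lower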