def test_arabic():
    # Remove tatweels
    assert tokenize('متــــــــعب', 'ar') == ['متعب']

    # Remove combining marks
    assert tokenize('حَرَكَات', 'ar') == ['حركات']

    # An Arabic ligature that is affected by NFKC normalization
    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
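
# A minimal standard-library sketch of the normalization the ligature test
# above relies on; this illustrates the Unicode behavior, not wordfreq's
# exact code path.
import unicodedata

def demo_nfkc_ligature():
    # U+FEFB ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM decomposes under
    # NFKC into LAM (U+0644) followed by ALEF (U+0627).
    assert unicodedata.normalize('NFKC', '\ufefb') == '\u0644\u0627'
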
def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    assert tokenize('谢谢谢谢', 'zh-CN') == tokens

    # Over-long codes for Chinese
    assert tokenize('谢谢谢谢', 'chi') == tokens
    assert tokenize('谢谢谢谢', 'zho') == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens
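
# The same checks as above, folded into a loop; a minimal sketch assuming
# tokenize() (imported by these tests) accepts every code in the list.
def demo_chinese_codes():
    for code in ['zh', 'zh-CN', 'chi', 'zho', 'cmn', 'yue']:
        assert tokenize('谢谢谢谢', code) == ['谢谢', '谢谢']
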
def test_casefolding():
    assert tokenize('WEISS', 'de') == ['weiss']
    assert tokenize('weiß', 'de') == ['weiss']
    assert tokenize('İstanbul', 'tr') == ['istanbul']
    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
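
# Why these tests need language-aware casing, sketched with the standard
# library alone: Python's default str.casefold() handles German ß, but it
# folds 'I' to plain 'i' and turns 'İ' into 'i' plus a combining dot, so
# Turkish needs special treatment (which wordfreq provides internally).
def demo_casefold_limits():
    assert 'weiß'.casefold() == 'weiss'    # ß folds to 'ss'
    assert 'İ'.casefold() == 'i\u0307'     # leaves U+0307 COMBINING DOT ABOVE
    assert 'I'.casefold() == 'i'           # not the Turkish dotless 'ı'
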
def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai
    # support, we would actually split this into a three-word phrase.
    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']

    # Test Khmer, a script similar to Thai
    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']

    # Remove vowel points in Hebrew
    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']

    # Deal with commas, cedillas, and I's in Turkish
    assert tokenize('kișinin', 'tr') == ['kişinin']
    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']

    # Deal with cedillas that should be commas-below in Romanian
    assert tokenize('acelaşi', 'ro') == ['același']
    assert tokenize('ACELAŞI', 'ro') == ['același']
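
# A rough standard-library sketch of the "remove vowel points" behavior:
# decompose, then drop combining marks. wordfreq's real pipeline is more
# selective than this blanket filter, so treat it as an approximation.
import unicodedata

def demo_strip_marks(text):
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

# demo_strip_marks('דֻּגְמָה') == 'דגמה'
# demo_strip_marks('حَرَكَات') == 'حركات'
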
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
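
# A toy illustration of Cyrillic-to-Latin transliteration with str.translate;
# the mapping below covers only the letters of this one word, and the real
# tables in wordfreq are far more complete (including one-to-many mappings
# such as 'њ' -> 'nj', which the dict form of str.maketrans can express).
CYR_TO_LAT = str.maketrans('ствари', 'stvari')

def demo_transliterate(word):
    return word.translate(CYR_TO_LAT)

# demo_transliterate('ствари') == 'stvari'
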
def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
def test_alternate_codes():
    # Try language codes for Serbo-Croatian that have been split, and now
    # are canonically mapped to Serbian
    assert tokenize("культуры", 'sh') == ["kul'tury"]
    assert tokenize("культуры", 'hbs') == ["kul'tury"]
import csv
import html
import sys

import wordfreq

if len(sys.argv) != 3:
    print('Usage: python3 sort.py target-lang pairs.csv')
    sys.exit(1)

targetLang = sys.argv[1]
pairsPath = sys.argv[2]

# Map each source phrase to (min frequency, average frequency, translation).
pairs = {}
with open(pairsPath, 'r', encoding='utf-8') as pairsFile:
    reader = csv.reader(pairsFile, delimiter='\t')
    for row in reader:
        words = wordfreq.tokenize(html.unescape(row[0]), targetLang)
        if not words:
            continue  # avoid min() and division errors on empty token lists
        freqs = [wordfreq.zipf_frequency(word, targetLang, wordlist='combined')
                 for word in words]
        minfreq = min(freqs)
        avgfreq = sum(freqs) / len(freqs)
        pairs[row[0]] = (minfreq, avgfreq, row[1])

# Sort so that phrases whose rarest word is most frequent come first,
# breaking ties by average frequency.
pairList = list(pairs.items())
pairList.sort(reverse=True, key=lambda i: i[1])

for pair in pairList:
    sys.stdout.buffer.write((pair[0] + '\t' + pair[1][2] + '\n').encode('utf-8'))
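
# Example run (file name and contents are hypothetical): given a
# tab-separated file pairs.csv with lines like
#     bonjour	hello
# the command `python3 sort.py fr pairs.csv` prints the translations
# ordered so that phrases made of common words come first.
# wordfreq.zipf_frequency() returns a value on a roughly 0-8 Zipf scale
# (higher means more frequent), so reverse-sorting the (min, avg, ...)
# tuples puts phrases whose rarest word is still common at the top.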