def test_minimums():
    assert word_frequency('esquivalience', 'en') == 0
    assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
    assert word_frequency('the', 'en', minimum=1) == 1
def test_combination():
    gamsa_freq = word_frequency('감사', 'ko')
    habnida_freq = word_frequency('합니다', 'ko')

    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
    assert (
        1.0 / word_frequency('감사합니다', 'ko') ==
        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
    )
def test_language_matching():
    freq = word_frequency('的', 'zh')
    assert word_frequency('的', 'zh-TW') == freq
    assert word_frequency('的', 'zh-CN') == freq
    assert word_frequency('的', 'zh-Hant') == freq
    assert word_frequency('的', 'zh-Hans') == freq
    assert word_frequency('的', 'yue-HK') == freq
    assert word_frequency('的', 'cmn') == freq
def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh')  # "Thanks"
    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
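The two test_combination snippets above reflect how wordfreq combines multi-token input: the reciprocal of the combined frequency is the sum of the reciprocals of the token frequencies, and Chinese (tokenized with jieba) appears to take a further factor-of-10 penalty per inferred token boundary, which is where the /20 comes from while the Korean case is only /2. A small sketch of that arithmetic; the helper name and penalty handling are mine, for illustration only, not part of wordfreq's API:

import pytest
from wordfreq import word_frequency

def combined_estimate(tokens, lang, inferred_boundaries=0):
    # Reciprocal-sum combination, plus a 10x penalty per inferred token
    # boundary (illustrative helper only).
    combined = 1.0 / sum(1.0 / word_frequency(t, lang) for t in tokens)
    return combined / (10 ** inferred_boundaries)

# '谢谢谢谢' splits into two '谢谢' tokens with one inferred boundary:
# 1 / (1/f + 1/f) = f/2, then the /10 penalty gives f/20.
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(
    combined_estimate(['谢谢', '谢谢'], 'zh', inferred_boundaries=1), rel=0.01
)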
def compose_collaborative_pun_hit(data_dict, key_filter, outfile, top_k=5):
    with open(outfile, 'w') as outf:
        header = ['Pun_alter']
        for i in range(top_k):
            header.append('sentence_' + str(i + 1))
        assert len(header) == top_k + 1
        outf.write(','.join(header) + '\n')
        for key in key_filter:
            results = data_dict[key]
            # Skip keyword pairs that contain very rare words
            if word_frequency(key[0], 'en') < 1e-6 or word_frequency(key[1], 'en') < 1e-6:
                print('skip the keyword pair:', ' '.join(key))
                continue
            contents = []
            contents.append('-'.join(key))
            if type(results) is tuple:
                results = results[0]
            for res in results[:top_k]:
                contents.append(res)
            outf.write(','.join(contents) + '\n')
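Since the retrieved sentences may themselves contain commas, joining fields with ','.join can produce malformed rows; a csv.writer variant avoids that by quoting fields. This is a sketch that assumes the same data_dict layout (key tuples mapping to ranked sentence lists), not the original project's code:

import csv
from wordfreq import word_frequency

def compose_collaborative_pun_hit_csv(data_dict, key_filter, outfile, top_k=5):
    # Same filtering as above, but with proper CSV quoting.
    with open(outfile, 'w', newline='') as outf:
        writer = csv.writer(outf)
        writer.writerow(['Pun_alter'] + ['sentence_%d' % (i + 1) for i in range(top_k)])
        for key in key_filter:
            if word_frequency(key[0], 'en') < 1e-6 or word_frequency(key[1], 'en') < 1e-6:
                continue  # skip keyword pairs containing very rare words
            results = data_dict[key]
            if isinstance(results, tuple):
                results = results[0]
            writer.writerow(['-'.join(key)] + list(results[:top_k]))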
def add_language_trigrams(normal_freqs, baked_freqs, language):
    """
    Collect the trigram frequencies of both correct and mojibaked text, using
    word examples from the given language.
    """
    for baseword in wordfreq.iter_wordlist(language):
        freq = wordfreq.word_frequency(baseword, language)
        for word in set([baseword, baseword.upper()]):
            if any(letter.isdigit() for letter in word):
                continue
            for frame in FRAMES:
                padded = frame % word
                for trigram in get_trigrams(padded):
                    normal_freqs[trigram] += freq

                for enc1 in COMMON_ENCODINGS + LANGUAGE_ENCODINGS[language]:
                    for enc2 in COMMON_ENCODINGS + LANGUAGE_ENCODINGS[language]:
                        if enc1 != enc2 and (enc1 not in COMMON_ENCODINGS or enc2 not in COMMON_ENCODINGS):
                            try:
                                mojibaked = padded.encode(enc1).decode(enc2)
                                if mojibaked != padded:
                                    for trigram in get_trigrams(mojibaked):
                                        baked_freqs[(trigram, enc2, enc1)] += freq
                            except UnicodeError:
                                # Some encode/decode combinations cannot represent this text; skip them.
                                pass
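FRAMES, get_trigrams, COMMON_ENCODINGS, and LANGUAGE_ENCODINGS are defined elsewhere in that script; the core operation the inner loops depend on is simply round-tripping text through mismatched encodings, for example:

# Round-tripping UTF-8 bytes through Latin-1 produces classic mojibake.
padded = ' naïve '
mojibaked = padded.encode('utf-8').decode('latin-1')
print(mojibaked)  # ' naÃ¯ve '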
out word if less frequent than this threshold
"""
# Datamuse is built from webscraping and occasionally returns offensive and oppressive language, which I am
# filtering out here. Although there is an appropriate and even critical way for humans to write poetry using some
# of these words that might be considered edge cases (e.g. Hottentot), a stochastic text generator does not have
# a historical sense to do that, so I have decided to exclude these.
unfitting_words = pkgutil.get_data('generativepoetry', 'data/hate_words.txt').decode("utf-8").splitlines()
unfitting_words.extend(pkgutil.get_data('generativepoetry', 'data/abbreviations_etc.txt').decode("utf-8")
                       .splitlines())
exclude_words.extend(unfitting_words)  # Some words Datamuse tends to return that disrupt poetic flow
validate_str(string)
if len(string) < 3:
    return False
if has_invalid_characters(string):
    return False
if word_frequency(string, 'en') < word_frequency_threshold:
    return False
if spellcheck and not hobj.spell(string):
    return False
if string in exclude_words:
    return False
return True
    integral80 += word_frequency(nl[i], 'nl', wordlist='large')
    if (integral80 <= 0.80*integral100):
        nlPopular.write(nl[i] + '\n')
    else:
        nlLongTail.write(nl[i] + '\n')
nlPopular.close()
nlLongTail.close()
#---------------------------------------------------------------
ptPopular = open(dest + '/ptPopular.txt', 'w')
ptLongTail = open(dest + '/ptLongTail.txt', 'w')
# Words whose cumulative frequency stays within 80% of the total frequency
# mass go to the Popular list; the rest go to the LongTail list.
integral100 = 0
for i in range(len(pt)):
    integral100 += word_frequency(pt[i], 'pt', wordlist='large')
integral80 = 0
for i in range(len(pt)):
    integral80 += word_frequency(pt[i], 'pt', wordlist='large')
    if (integral80 <= 0.80*integral100):
        ptPopular.write(pt[i] + '\n')
    else:
        ptLongTail.write(pt[i] + '\n')
ptPopular.close()
ptLongTail.close()
#---------------------------------------------------------------
svPopular = open(dest + '/svPopular.txt', 'w')
svLongTail = open(dest + '/svLongTail.txt', 'w')
def compute_word_frequency_norms(self):
    freqs = []
    for char in self.tokens:
        freq = wordfreq.word_frequency(char, 'zh')
        if freq == 0:
            continue
        freqs.append(freq)
    try:
        self.features['mean_word_frequency'] = statistics.mean(freqs)
        self.features['median_word_frequency'] = statistics.median(freqs)
    except statistics.StatisticsError:
        # No token had a nonzero frequency, so fall back to 0.
        self.features['mean_word_frequency'] = 0
        self.features['median_word_frequency'] = 0
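The same statistics can be reproduced outside the class on any token list; the tokens below are just an example, not taken from the original project:

import statistics
import wordfreq

tokens = ['我', '喜欢', '咖啡']  # example tokens
freqs = [f for f in (wordfreq.word_frequency(t, 'zh') for t in tokens) if f > 0]
mean_freq = statistics.mean(freqs) if freqs else 0
median_freq = statistics.median(freqs) if freqs else 0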
        nlLongTail.write(nl[i] + '\n')
nlPopular.close()
nlLongTail.close()
#---------------------------------------------------------------
ptPopular = open(dest + '/ptPopular.words', 'w')
ptLongTail = open(dest + '/ptLongTail.words', 'w')
integral100 = 0
for i in range(len(pt)):
    integral100 += word_frequency(pt[i], 'pt', wordlist='large')
integral80 = 0
for i in range(len(pt)):
    integral80 += word_frequency(pt[i], 'pt', wordlist='large')
    if (integral80 <= 0.80*integral100):
        ptPopular.write(pt[i] + '\n')
    else:
        ptLongTail.write(pt[i] + '\n')
ptPopular.close()
ptLongTail.close()
integralList = []
integral = 0
f = []
for i in range(0, len(pt)):
    # Normalize each frequency by the first word's frequency and keep a
    # running cumulative total.
    f.insert(i, word_frequency(pt[i], 'pt', wordlist='large') / word_frequency(pt[0], 'pt', wordlist='large'))
    integral += word_frequency(pt[i], 'pt', wordlist='large')
    integralList.insert(i, integral)
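The per-language blocks above all perform the same cumulative 80/20 split; a generic helper (the name and signature are mine, not from the original script) states the intent more directly:

from wordfreq import word_frequency

def split_by_cumulative_frequency(words, lang, share=0.80, wordlist='large'):
    # Words are assigned to the popular list until they account for `share`
    # of the total frequency mass; the remainder form the long tail.
    total = sum(word_frequency(w, lang, wordlist=wordlist) for w in words)
    popular, long_tail, running = [], [], 0.0
    for w in words:
        running += word_frequency(w, lang, wordlist=wordlist)
        (popular if running <= share * total else long_tail).append(w)
    return popular, long_tail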