from collections import Counter

# lossy_tokenize, TOKEN_RE, and get_frequency_dict are assumed to be
# provided elsewhere in the surrounding package.


def recount_messy(infile, outfile, language):
    """
    Take in a file of counts from another source (such as Google Books), and
    make it consistent with our tokenization and format.

    Each input line is a tab-separated "text<TAB>count" pair; any existing
    '__total__' line is skipped and the total is recomputed.
    """
    counts = Counter()
    total = 0
    for line in infile:
        line = line.rstrip()
        if line and not line.startswith('__total__'):
            text, strcount = line.split('\t', 1)
            count = int(strcount)
            for token in lossy_tokenize(text, language, external_wordlist=True):
                counts[token] += count
                total += count

    # Write the recomputed total, then the counted tokens, to outfile
    print('__total__\t{}'.format(total), file=outfile)
    for token, count in counts.most_common():
        if TOKEN_RE.match(token):
            print('{}\t{}'.format(token, count), file=outfile)
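
# Usage sketch with made-up counts, using in-memory files:
#
#     import io
#     messy = io.StringIO('the cat\t10\ncat\t5\n')
#     cleaned = io.StringIO()
#     recount_messy(messy, cleaned, 'en')
#     # cleaned.getvalue() is now something like:
#     # '__total__\t25\ncat\t15\nthe\t10\n'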

def _word_frequency(word, lang, wordlist, minimum):
    tokens = lossy_tokenize(word, lang)
    if not tokens:
        return minimum

    # Frequencies for multiple tokens are combined using the formula
    #     1 / f = 1 / f1 + 1 / f2 + ...
    # Thus the resulting frequency is less than any individual frequency, and
    # the smallest frequency dominates the sum.
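    # For example, with made-up frequencies of 1e-3 and 1e-5, the combined
    # frequency is 1 / (1/1e-3 + 1/1e-5) = 1/101000, about 9.9e-6, just
    # under the rarer token's frequency.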
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        one_over_result += 1.0 / freqs[token]

    freq = 1.0 / one_over_result
    # Never return a value smaller than the requested minimum.
    return max(freq, minimum)
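
# A standalone sketch of the same combination rule, for illustration only
# (_combine_token_freqs is a hypothetical helper, not part of the package):
def _combine_token_freqs(token_freqs):
    """Combine per-token frequencies via 1/f = 1/f1 + 1/f2 + ..."""
    return 1.0 / sum(1.0 / f for f in token_freqs)

# e.g. _combine_token_freqs([1e-3, 1e-5]) -> about 9.9e-06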