"""
Take in multiple files of word counts by their filename, and produce a
frequency list in the named output file. The counts should be in the format
we produce that has a __total__ at the top. We merge them into a single
frequency list using the 'figure skating average' defined above.
"""
freq_dicts = []
for input_filename in input_filenames:
freq_dict = defaultdict(float)
with open(input_filename, encoding='utf-8') as infile:
total = None
for line in infile:
word, strcount = line.rstrip().split('\t', 1)
# Correct for earlier steps that might not have handled curly
# apostrophes consistently
word = uncurl_quotes(word).strip("' ")
if word:
count = int(strcount)
if word == '__total__':
total = count
else:
freq = count / total
if freq < 1e-9:
break
freq_dict[word] += freq
freq_dicts.append(freq_dict)
merged_dict = merge_freqs(freq_dicts)
with open(output_filename, 'w', encoding='utf-8') as outfile:
_write_frequency_file(merged_dict, outfile)
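The 'figure skating average' itself is "defined above" in the source module and isn't shown here. As a minimal sketch, assuming the usual meaning of the term (drop each word's highest and lowest frequency across sources, then average the rest), merge_freqs might look like the following; this is illustrative, not the module's actual implementation.

def merge_freqs(freq_dicts):
    # Sketch of a 'figure skating average' merge (assumed definition): for
    # each word, discard the extreme values, the way skating judges do, and
    # average what remains.
    vocab = set()
    for freq_dict in freq_dicts:
        vocab.update(freq_dict)

    merged = {}
    for word in vocab:
        freqs = sorted(d.get(word, 0.0) for d in freq_dicts)
        if len(freqs) > 2:
            freqs = freqs[1:-1]
        merged[word] = sum(freqs) / len(freqs)
    return merged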
# The next example is the body of a text-fixing loop in the style of ftfy's
# fix_text_segment. The signature below is reconstructed from the flags the
# body uses; the parameter defaults are assumptions.
import unicodedata

from ftfy import fixes


def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    # In 'auto' mode, leave HTML entities alone when the text contains both
    # '<' and '>', because it is probably actual HTML markup.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            # Fixed point: another pass would change nothing, so stop.
            return text
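A quick usage sketch, assuming ftfy is installed: the loop keeps reapplying the fixes until the text reaches a fixed point, so even multiply mangled mojibake comes out clean. ftfy's public fix_text wraps this per-segment logic.

import ftfy

# 'âœ”' is the UTF-8 encoding of '✔' misread as Latin-1.
print(ftfy.fix_text('âœ” No problems'))  # -> '✔ No problems'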
import re

from ftfy.fixes import uncurl_quotes


def _text_standardize(text):
    """
    Fixes some issues the spacy tokenizer had on books corpus.
    Also handles whitespace standardization.
    """
    # Pad runs of punctuation with spaces so the tokenizer splits on them.
    # Raw strings avoid invalid-escape warnings and make the backslash
    # alternative (\\+) actually match backslashes, as intended.
    text = re.sub(
        r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""",
        r" \1 ",
        text,
    )
    # Collapse whitespace around newlines to a single space-padded '\n',
    text = re.sub(r"\s*\n\s*", " \n ", text)
    # then collapse every other whitespace run to a single space.
    text = re.sub(r"[^\S\n]+", " ", text)
    return uncurl_quotes(text.strip().lower())
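For illustration, here is what the helper produces on a made-up input with a curly apostrophe, doubled spaces, punctuation, and a blank line:

print(repr(_text_standardize("Don’t  worry!!\n\n(really)")))
# "don't worry !! \n ( really )"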
# A later revision of the same function body, scraped as a separate example:
# fix_text_encoding has been renamed to fix_encoding, the steps are
# reordered, and remove_bom is skipped when remove_control_chars already
# ran. The signature is reconstructed the same way as above.
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
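The renamed fix_encoding step is the workhorse of this variant. As a small sketch of what that one fix does on its own (assuming a recent ftfy), it repairs UTF-8 text that was misdecoded as Latin-1:

from ftfy import fixes

print(fixes.fix_encoding('sÃ³lo'))  # -> 'sólo'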