Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this is a fragment of a function body -- the enclosing `def`
# and the original indentation are not visible here.
#
# Run every enabled fix over `text`, one after another, and repeat the whole
# pass until a pass leaves the text unchanged; that stable text is returned.
while True:
# Snapshot of the text before this pass, compared against at the bottom.
origtext = text
# Each step below is gated by its own flag and rebinds `text` to the result
# of the corresponding helper in `fixes`.
if remove_terminal_escapes:
text = fixes.remove_terminal_escapes(text)
if fix_encoding:
text = fixes.fix_encoding(text)
if fix_entities:
text = fixes.unescape_html(text)
if fix_latin_ligatures:
text = fixes.fix_latin_ligatures(text)
if fix_character_width:
text = fixes.fix_character_width(text)
if uncurl_quotes:
text = fixes.uncurl_quotes(text)
if fix_line_breaks:
text = fixes.fix_line_breaks(text)
if fix_surrogates:
text = fixes.fix_surrogates(text)
if remove_control_chars:
text = fixes.remove_control_chars(text)
if remove_bom and not remove_control_chars:
# Skip this step if we've already done `remove_control_chars`,
# because it would be redundant.
text = fixes.remove_bom(text)
# `normalization` is passed as the form argument to
# `unicodedata.normalize`; `None` skips normalization.
if normalization is not None:
text = unicodedata.normalize(normalization, text)
# Fixed point reached: nothing changed during this pass, so stop. Otherwise
# loop again, since one fix may expose text another fix can act on.
if text == origtext:
return text
- Posts in English should have score >= 2 (they should have net upvotes)
- Other posts should have score >= 1 (no net downvotes)
- Posts from subreddits that are banned in 2018 are skipped
"""
# NOTE(review): this is a fragment of a function body -- the enclosing `def`,
# the start of its docstring, and the original indentation are not visible.
#
# Each input line is parsed as one JSON document.
for line in infile:
data = json.loads(line)
# Keep only records that carry both 'score' and 'body', whose score is
# non-null and at least 1, and whose body is not the "[deleted]" marker.
if (
'score' in data and 'body' in data and
data["score"] is not None and data["score"] >= 1 and
data["body"] != "[deleted]"
):
subreddit = data["subreddit"]
# Membership is tested on the mmh3 hash of the subreddit name, not on the
# name itself. BANNED_SUBREDDITS is defined outside this fragment --
# presumably a collection of such hashes; TODO confirm.
subreddit_hash = mmh3.hash(subreddit)
if subreddit_hash not in BANNED_SUBREDDITS:
# Applied innermost first: fix_line_breaks, then unescape_html, then
# fix_surrogates.
md = fix_surrogates(unescape_html(fix_line_breaks(data["body"])))
text = strip_markdown(md)
# Flatten to a single line and drop U+200B (zero width space) characters.
text = text.replace("\n", " ").replace("\u200b", "")
# Delete every match of URL_RE (pattern defined outside this fragment).
text = URL_RE.sub("", text)
# Nothing to emit if the cleanup left an empty string.
if text:
# detect_language yields a (language code, confidence flag) pair; only
# confident detections are written out.
lang, confident = detect_language(text)
if confident:
# There are more English posts than we need, so filter them
# for score >= 2
if lang != "en" or data["score"] > 1:
# Output format: language code, a tab, then the cleaned text.
print(f"{lang}\t{text}", file=outfile)
# NOTE(review): this is a fragment of a function body -- the enclosing `def`
# and the original indentation are not visible here.
#
# Run every enabled fix over `text`, one after another, and repeat the whole
# pass until a pass leaves the text unchanged; that stable text is returned.
while True:
# Snapshot of the text before this pass, compared against at the bottom.
origtext = text
# In this loop HTML entities are unescaped first, before the other fixes.
if fix_entities:
text = fixes.unescape_html(text)
if remove_terminal_escapes:
text = fixes.remove_terminal_escapes(text)
# NOTE(review): the helper called here is `fixes.fix_text_encoding`; confirm
# that name exists in the `fixes` module this code is paired with.
if fix_encoding:
text = fixes.fix_text_encoding(text)
if fix_latin_ligatures:
text = fixes.fix_latin_ligatures(text)
if fix_character_width:
text = fixes.fix_character_width(text)
if uncurl_quotes:
text = fixes.uncurl_quotes(text)
if fix_line_breaks:
text = fixes.fix_line_breaks(text)
if fix_surrogates:
text = fixes.fix_surrogates(text)
if remove_control_chars:
text = fixes.remove_control_chars(text)
# remove_bom runs whenever its flag is set, independently of
# `remove_control_chars`.
if remove_bom:
text = fixes.remove_bom(text)
# `normalization` is passed as the form argument to
# `unicodedata.normalize`; `None` skips normalization.
if normalization is not None:
text = unicodedata.normalize(normalization, text)
# Fixed point reached: nothing changed during this pass, so stop. Otherwise
# loop again, since one fix may expose text another fix can act on.
if text == origtext:
return text