Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Expectations for the text-fixing helpers fix_text, fix_text_segment and
# unescape_html. `example` is not defined anywhere in this excerpt.
#
# The same expected string is asserted for both entry points and for both
# settings of fix_entities.
assert fix_text(example, fix_entities=True) == '&\n\n&'
assert fix_text_segment(example, fix_entities=True) == '&\n\n&'
assert fix_text(example, fix_entities=False) == '&\n\n&'
assert fix_text_segment(example, fix_entities=False) == '&\n\n&'
# A bare pair of angle brackets is expected to come back unchanged whether
# fix_entities is False, True, or left at its default.
assert fix_text_segment('<>', fix_entities=False) == '<>'
assert fix_text_segment('<>', fix_entities=True) == '<>'
assert fix_text_segment('<>') == '<>'
# Words containing the non-ASCII letter ś / Ś are expected to be returned
# exactly as given, in both lower and upper case.
assert fix_text_segment('jednocześnie') == 'jednocześnie'
assert fix_text_segment('JEDNOCZEŚNIE') == 'JEDNOCZEŚNIE'
# With normalization='NFKC' the single ellipsis character is expected to
# become three ASCII dots.
# NOTE(review): the next two assertions are identical; one of them may have
# lost a distinguishing detail -- confirm against the upstream test.
assert fix_text_segment('ellipsis…', normalization='NFKC') == 'ellipsis...'
assert fix_text_segment('ellipsis…', normalization='NFKC') == 'ellipsis...'
# NOTE(review): the expected value ends in '\x81' but the input literal does
# not contain it -- possibly the character was dropped from the input when
# this excerpt was extracted; verify before relying on this assertion.
assert fix_text_segment('broken') == 'broken\x81'
# unescape_html is expected to leave both of these strings unchanged.
# NOTE(review): the inputs may originally have contained HTML entities that
# were decoded during extraction -- TODO confirm.
assert unescape_html('euro €') == 'euro €'
assert unescape_html('not an entity x6;') == 'not an entity x6;'
def check_ftfy(self, text, encoding_only=True):
"""
Given a single text input, check whether `ftfy.fix_text_encoding`
would change it. If so, display the change.
"""
self.count += 1
text = unescape_html(text)
if not possible_encoding(text, 'ascii'):
if encoding_only:
fixed = fix_encoding(text)
else:
fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
if text != fixed:
# possibly filter common bots before printing
print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
text=text, fixed=fixed
))
self.num_fixed += 1
elif 'â€' in text or '\x80' in text:
print('\nNot fixed:\t{text!r}'.format(text=text))
# Print status updates once in a while
if self.count % 100 == 0:
See `fix_text` for a description of the parameters.
"""
if isinstance(text, bytes):
raise UnicodeError(fixes.BYTES_ERROR_TEXT)
if fix_entities == 'auto' and '<' in text and '>' in text:
fix_entities = False
while True:
origtext = text
if remove_terminal_escapes:
text = fixes.remove_terminal_escapes(text)
if fix_encoding:
text = fixes.fix_encoding(text)
if fix_entities:
text = fixes.unescape_html(text)
if fix_latin_ligatures:
text = fixes.fix_latin_ligatures(text)
if fix_character_width:
text = fixes.fix_character_width(text)
if uncurl_quotes:
text = fixes.uncurl_quotes(text)
if fix_line_breaks:
text = fixes.fix_line_breaks(text)
if fix_surrogates:
text = fixes.fix_surrogates(text)
if remove_control_chars:
text = fixes.remove_control_chars(text)
if remove_bom and not remove_control_chars:
# Skip this step if we've already done `remove_control_chars`,
# because it would be redundant.
text = fixes.remove_bom(text)
# replace internal links with just their labels
text = replace_internal_links(text)
# text = replace_internal_links(text) # TODO: is this needed?
# remove table markup
text = text.replace('||', '\n|').replace('!!', '\n!') # put each cell on a separate line
text = re_table_formatting.sub('\n', text) # remove formatting lines
text = re_table_cell_formatting.sub('\n\\3', text) # leave only cell content
# strip out text formatting
text = re_italic_quote.sub(r'"\1"', text)
text = re_bold_italic.sub(r'\1', text)
text = re_quote_quote.sub(r'"\1"', text)
# unescape html entities
text = ftfy.fixes.unescape_html(text)
# final cleanup
text = re_headings.sub(r'\n\n\2\n\n', text)
text = re_dots.sub('...', text)
text = re_brackets.sub(r'', text)
text = text.replace('[[', '').replace(']]', '')
text = text.replace('<<', '«').replace('>>', '»')
text = re_random_cruft.sub(r'\1', text)
text = re.sub(r'\n\W+?\n', r'\n', text, flags=re.UNICODE)
text = text.replace(',,', ',').replace(',.', '.')
text = re_spaces.sub(' ', text)
text = re_linebreaks.sub(r'\n\n', text)
return text.strip()
"""
Apply fixes to text in a single chunk. This could be a line of text
within a larger run of `fix_text`, or it could be a larger amount
of text that you are certain is in a consistent encoding.
See `fix_text` for a description of the parameters.
"""
if isinstance(text, bytes):
raise UnicodeError(fixes.BYTES_ERROR_TEXT)
if fix_entities == 'auto' and '<' in text and '>' in text:
fix_entities = False
while True:
origtext = text
if fix_entities:
text = fixes.unescape_html(text)
if remove_terminal_escapes:
text = fixes.remove_terminal_escapes(text)
if fix_encoding:
text = fixes.fix_text_encoding(text)
if fix_latin_ligatures:
text = fixes.fix_latin_ligatures(text)
if fix_character_width:
text = fixes.fix_character_width(text)
if uncurl_quotes:
text = fixes.uncurl_quotes(text)
if fix_line_breaks:
text = fixes.fix_line_breaks(text)
if fix_surrogates:
text = fixes.fix_surrogates(text)
if remove_control_chars:
text = fixes.remove_control_chars(text)
text = _replace_external_links(text)
# drop magic words behavioral switches
text = re_magic_words.sub("", text)
# replace internal links with just their labels
text = _replace_internal_links(text)
# text = _replace_internal_links(text) # TODO: is this needed?
# remove table markup
text = text.replace("||", "\n|").replace("!!", "\n!") # put each cell on a new line
text = re_table_formatting.sub("\n", text) # remove formatting lines
text = re_table_cell_formatting.sub("\n\\3", text) # leave only cell content
# strip out text formatting
text = re_italic_quote.sub(r'"\1"', text)
text = re_bold_italic.sub(r"\1", text)
text = re_quote_quote.sub(r'"\1"', text)
# unescape html entities
text = ftfy.fixes.unescape_html(text)
# final cleanup
if include_headings is True:
text = re_headings.sub(r"\n\n\2\n\n", text)
else:
text = re_headings.sub(r"\n\n", text)
text = re_dots.sub("...", text)
text = re_brackets.sub(r"", text)
text = text.replace("[[", "").replace("]]", "")
text = text.replace("<<", "«").replace(">>", "»")
text = re_random_cruft.sub(r"\1", text)
text = re.sub(r"\n\W+?\n", r"\n", text, flags=re.UNICODE)
text = text.replace(",,", ",").replace(",.", ".")
text = re_spaces.sub(" ", text)
text = re_linebreaks.sub(r"\n\n", text)
return text.strip()