import ftfy
import pytest


def test_dale_chall(text, expected, nlp):
    # `text`/`expected` arrive via pytest parametrization; `nlp` is a spaCy
    # pipeline fixture with the textdescriptives extensions registered.
    text = ftfy.fix_text(text)
    text = " ".join(text.split())
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.dale_chall


def test_linsear_write(text, expected, nlp):
    text = ftfy.fix_text(text)
    text = " ".join(text.split())
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.linsear_write
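For context, a minimal sketch of how such tests are typically wired up. The parametrize case, the model name, and the component name are illustrative assumptions, not taken from the original test file:

import pytest
import spacy
import textdescriptives  # noqa: F401 -- importing registers the Doc._ extensions


@pytest.fixture(scope="session")
def nlp():
    # Assumes the small English model is installed:
    #   python -m spacy download en_core_web_sm
    pipeline = spacy.load("en_core_web_sm")
    pipeline.add_pipe("textdescriptives/readability")  # textdescriptives v2-style name
    return pipeline


@pytest.mark.parametrize("text,expected", [
    ("The cat sat on the mat.", 0.0),  # illustrative value, not a real expectation
])
def test_dale_chall_sketch(text, expected, nlp):
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.dale_chall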
from ftfy import fix_text, fix_text_segment
from ftfy.fixes import unescape_html


def test_entities():
    example = '&amp;\n<html>\n&lt;html&gt;'
    assert fix_text(example) == '&\n<html>\n&lt;html&gt;'
    assert fix_text_segment(example) == '&amp;\n<html>\n&lt;html&gt;'

    assert fix_text(example, fix_entities=True) == '&\n<html>\n<html>'
    assert fix_text_segment(example, fix_entities=True) == '&\n<html>\n<html>'

    assert fix_text(example, fix_entities=False) == '&amp;\n<html>\n&lt;html&gt;'
    assert fix_text_segment(example, fix_entities=False) == '&amp;\n<html>\n&lt;html&gt;'

    assert fix_text_segment('&lt;&gt;', fix_entities=False) == '&lt;&gt;'
    assert fix_text_segment('&lt;&gt;', fix_entities=True) == '<>'
    assert fix_text_segment('&lt;&gt;') == '<>'
    assert fix_text_segment('jednocze&sacute;nie') == 'jednocześnie'
    assert fix_text_segment('JEDNOCZE&Sacute;NIE') == 'JEDNOCZEŚNIE'
    assert fix_text_segment('ellipsis&#133;', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('ellipsis&#x85;', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('broken&#x81;') == 'broken\x81'
    assert unescape_html('euro &#x80;') == 'euro €'
    assert unescape_html('not an entity &#20x6;') == 'not an entity &#20x6;'
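A quick demonstration of the behavior this test pins down, assuming an ftfy 5.x install (the fix_entities keyword used above was renamed in ftfy 6):

from ftfy import fix_text_segment

# Entities are decoded by default only when the segment doesn't look like
# HTML; fix_entities=True forces decoding and fix_entities=False disables it.
print(fix_text_segment('l&eacute;opard'))                      # léopard
print(fix_text_segment('l&eacute;opard', fix_entities=False))  # l&eacute;opard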
from ftfy.fixes import fix_surrogates


def test_surrogates():
    assert fix_surrogates('\udbff\udfff') == '\U0010ffff'
    assert fix_surrogates('\ud800\udc00') == '\U00010000'
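Strings like these arise when UTF-16 surrogate code units survive a lenient decode; a small self-contained illustration using CESU-8-style bytes:

from ftfy.fixes import fix_surrogates

# U+1F600 as two UTF-16 surrogates, each serialized as three UTF-8 bytes
# (the CESU-8 mistake). 'surrogatepass' lets Python materialize them.
broken = b'\xed\xa0\xbd\xed\xb8\x80'.decode('utf-8', 'surrogatepass')
assert broken == '\ud83d\ude00'
assert fix_surrogates(broken) == '\U0001f600'  # 😀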
from ftfy import fix_encoding, fix_text
from ftfy.chardata import possible_encoding
from ftfy.fixes import unescape_html


def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether `ftfy.fix_encoding` would
    change it. If so, display the change.
    """
    self.count += 1
    text = unescape_html(text)
    if not possible_encoding(text, 'ascii'):
        if encoding_only:
            fixed = fix_encoding(text)
        else:
            fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
        if text != fixed:
            # possibly filter common bots before printing
            print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=text, fixed=fixed
            ))
            self.num_fixed += 1
        elif 'â€' in text or '\x80' in text:
            print('\nNot fixed:\t{text!r}'.format(text=text))

    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
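check_ftfy is a method and expects count/num_fixed counters on its instance; a hypothetical harness (the class name and the stdin loop are assumptions, not from the original script) showing how it might be driven:

import sys


class StreamTester:
    def __init__(self):
        self.count = 0
        self.num_fixed = 0

    check_ftfy = check_ftfy  # adopt the module-level function above as a method


if __name__ == '__main__':
    tester = StreamTester()
    for line in sys.stdin:
        tester.check_ftfy(line.rstrip('\n'))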
from ftfy import fix_encoding


def test_unknown_emoji():
    # The range we accept as emoji has gotten larger. Let's make sure we can
    # decode the futuristic emoji U+1F960, which will probably be a picture of
    # a fortune cookie in Unicode 10.0:
    emoji_text = "\U0001f960 I see emoji in your future"
    emojibake = emoji_text.encode('utf-8').decode('windows-1252')
    assert fix_encoding(emojibake) == emoji_text

    # We believe enough in the future of this codepoint that we'll even
    # recognize it with a mangled byte A0
    emojibake = emojibake.replace('\xa0', ' ')
    assert fix_encoding(emojibake) == emoji_text

    # Increment the first byte to get a very similar test case, but a
    # codepoint that will definitely not exist anytime soon. In this case,
    # we consider the existing text, "ñŸ¥\xa0", to be more probable.
    not_emoji = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    assert fix_encoding(not_emoji) == not_emoji
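The byte-level mechanics behind this test, using only the standard library:

# U+1F960 is four bytes in UTF-8; decoding them as windows-1252 maps each
# byte to its own character, producing the familiar "ðŸ" mojibake shape.
raw = "\U0001f960".encode('utf-8')
print(raw.hex(' '))               # f0 9f a5 a0
print(raw.decode('windows-1252')) # 'ðŸ¥\xa0'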