def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether `ftfy.fix_encoding`
    would change it. If so, display the change.
    """
    self.count += 1
    text = unescape_html(text)
    if not possible_encoding(text, 'ascii'):
        if encoding_only:
            fixed = fix_encoding(text)
        else:
            fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
        if text != fixed:
            # possibly filter common bots before printing
            print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=text, fixed=fixed
            ))
            self.num_fixed += 1
        elif 'â€' in text or '\x80' in text:
            print('\nNot fixed:\t{text!r}'.format(text=text))

    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
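For context, here is a minimal sketch of the kind of change `check_ftfy` is looking for, calling `fix_encoding` from ftfy's public API. The sample string is hypothetical mojibake (UTF-8 bytes that were read as Windows-1252), not data from the stream being tested:

from ftfy import fix_encoding

sample = 'The Mona Lisa doesnâ€™t have eyebrows.'   # UTF-8 read as Windows-1252
print(fix_encoding(sample))
# should print: The Mona Lisa doesn’t have eyebrows.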
def fix_text_and_explain(text):
    """
    Performs a single step of re-encoding text that's been decoded incorrectly.
    It returns the decoded text, plus a structure explaining what it did.

    This structure could be used for more than it currently is, but we at least
    use it to track whether we had to interpret text as an old encoding such as
    MacRoman or cp437.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in FIXABLE_CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            print('possible encoding: %s' % encoding)
            # This is an ugly-looking way to get the bytes that represent
            # the text in this encoding. The reason we can't necessarily
            # use .encode(encoding) is that the decoder is very likely
            # to have been sloppier than Python.
            #
            # The decoder might have left bytes unchanged when they're not
            # part of the encoding. It might represent b'\x81' as u'\x81'
            # in Windows-1252, while Python would claim that using byte
            # 0x81 in Windows-1252 is an error.
            #
            # So what we do here is we use the .translate method of Unicode
            # strings. Using it with the character maps we have computed will
            # give us back a Unicode string using only code points up to 0xff.
            # This can then be converted into the intended bytes by encoding
            # it as Latin-1.
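The comment above describes recovering the original bytes with `str.translate` plus a Latin-1 encode, rather than a plain `.encode(encoding)`. Below is a minimal sketch of that trick for a single encoding, Windows-1252, assuming the only sloppiness to undo is a decoder that passed undefined bytes through unchanged; the helper name is hypothetical, and ftfy's real character maps are precomputed and cover more encodings:

def sloppy_windows1252_to_bytes(text):
    # Map the code points a strict Windows-1252 decoder produces for bytes
    # 0x80-0x9f back down to those byte values, so every character in the
    # result is <= U+00FF and Latin-1 can turn it straight back into bytes.
    charmap = {}
    for byte in range(0x80, 0xa0):
        try:
            char = bytes([byte]).decode('windows-1252')
        except UnicodeDecodeError:
            # A sloppier decoder would have passed the byte through unchanged.
            char = chr(byte)
        charmap[ord(char)] = chr(byte)
    return text.translate(charmap).encode('latin-1')


# 'â€™' is what a curly apostrophe's UTF-8 bytes look like after a bad decode.
assert sloppy_windows1252_to_bytes('â€™') == b'\xe2\x80\x99'
assert sloppy_windows1252_to_bytes('â€™').decode('utf-8') == '\u2019'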
import codecs

import numpy as np

# N, ENCODINGS, and trigram_to_row are defined elsewhere in the original
# module: ENCODINGS lists the candidate encodings, N is presumably
# len(ENCODINGS), and trigram_to_row maps a 3-byte trigram to a row index
# below 1 << 23.


def learn_matrix(datafile):
    # One row per byte trigram, one column per candidate encoding. Counts
    # start at 1 (ones, not zeros), which smooths trigrams that are never seen.
    matrix = np.ones((1 << 23, N), np.float32, order='F')
    count = 0
    for line in codecs.open(datafile, encoding='utf-8'):
        count += 1
        if count % 1000 == 0:
            print(count)
        if possible_encoding(line, 'ascii'):
            # Pure ASCII lines look the same in every encoding; skip them.
            continue
        for i, encoding in enumerate(ENCODINGS):
            try:
                linebytes = line.encode(encoding)
                for pos in range(1, len(linebytes) - 1):
                    if linebytes[pos] >= 0x80:
                        trigram = linebytes[pos-1:pos+2]
                        assert len(trigram) == 3
                        row = trigram_to_row(trigram)
                        matrix[row, i] += 1
            except UnicodeEncodeError:
                # The line can't be represented in this encoding at all.
                pass
    # Scale each row by the sum of its squared counts.
    norms = np.sum(matrix * matrix, axis=1)[:, np.newaxis]
    return matrix / norms
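`learn_matrix` relies on `trigram_to_row`, which is not shown in this excerpt. Since the middle byte of every recorded trigram is >= 0x80, its high bit carries no information, so the three bytes fit in the 23 bits that index the `1 << 23` rows. A hypothetical packing along those lines (the real function may order the bits differently):

def trigram_to_row_sketch(trigram):
    # trigram is a 3-byte slice; indexing a bytes object yields ints 0-255.
    a, b, c = trigram
    # 8 bits for the first byte, 7 for the middle byte (its high bit is
    # always set, so it is dropped), and 8 for the last byte: 23 bits total.
    return (a << 15) | ((b & 0x7f) << 8) | c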