Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# enough of the following bytes to decode anything, so consume
# zero bytes and wait.
return '', 0
else:
if CESU8_RE.match(input):
# If this is a CESU-8 sequence, do some math to pull out
# the intended 20-bit value, and consume six bytes.
bytenums = bytes_to_ints(input[:6])
codepoint = (
((bytenums[1] & 0x0f) << 16) +
((bytenums[2] & 0x3f) << 10) +
((bytenums[4] & 0x0f) << 6) +
(bytenums[5] & 0x3f) +
0x10000
)
return unichr(codepoint), 6
else:
# This looked like a CESU-8 sequence, but it wasn't one.
# 0xed indicates the start of a three-byte sequence, so give
# three bytes to the superclass, so it can either decode them
# as a surrogate codepoint (on Python 2) or handle the error
# (on Python 3).
return sup(input[:3], errors, False)
def fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.

    `match` is a regex match object whose full matched text (group 0) is
    the candidate entity, such as "&#x41;", "&#65;" or "&amp;". The return
    value is the decoded character, or the matched text unchanged when the
    entity cannot be decoded.
    """
    text = match.group(0)
    if text[:2] == "&#":
        # Numeric character reference: "&#x<hex>;" or "&#<decimal>;".
        # The slices drop the prefix and the trailing ";".
        try:
            if text[:3] == "&#x":
                return unichr(int(text[3:-1], 16))
            else:
                return unichr(int(text[2:-1]))
        except (ValueError, OverflowError):
            # ValueError: the digits don't parse, or the number is not a
            # valid codepoint. OverflowError: the number is too large to
            # be passed to unichr at all. Either way, fall through and
            # return the entity text untouched.
            pass
    else:
        # Named entity: look up the name between "&" and ";".
        try:
            text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError:
            # Unknown entity name.
            pass
    return text  # leave as is
return HTML_ENTITY_RE.sub(fixup, text)
if len(input) < 6:
if final:
return sup(input, errors, True)
else:
return '', 0
else:
if CESU8_RE.match(input):
bytenums = bytes_to_ints(input[:6])
codepoint = (
((bytenums[1] & 0x0f) << 16) +
((bytenums[2] & 0x3f) << 10) +
((bytenums[4] & 0x0f) << 6) +
(bytenums[5] & 0x3f) +
0x10000
)
return unichr(codepoint), 6
else:
return sup(input[:3], errors, False)
if possible.
"""
text = match.group(0)
if text[:2] == "&#":
# character reference
try:
if text[:3] == "&#x":
return unichr(int(text[3:-1], 16))
else:
return unichr(int(text[2:-1]))
except ValueError:
pass
else:
# named entity
try:
text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
except KeyError:
pass
return text # leave as is
return HTML_ENTITY_RE.sub(fixup, text)
def remove_bom(text):
    r"""
    Remove a left-over byte-order mark (U+FEFF) from the start of `text`.

    Every consecutive U+FEFF character at the beginning of the string is
    stripped; occurrences anywhere else in the string are left in place.

    >>> print(remove_bom(unichr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(unichr(0xfeff))