Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def decode_html(html, default_encoding=DEFAULT_ENCODING, encoding=None,
                errors=DEFAULT_ENC_ERRORS):
    """
    Converts a `html` containing an HTML page into Unicode.

    The encoding is chosen in this order:

    1. `html` that is already Unicode is returned untouched.
    2. An explicit `encoding`, when given, is used unconditionally.
    3. The charset captured by CHARSET_META_TAG_PATTERN (presumably the
       charset declared in a ``<meta>`` tag), if it names a known codec.
    4. Strict UTF-8.
    5. `default_encoding`, as a last resort.

    :param html: Page content, either byte string or Unicode.
    :param default_encoding: Codec tried when UTF-8 decoding fails.
    :param encoding: Codec to use without any guessing (optional).
    :param errors: Error handler passed to ``decode`` for the explicit,
        declared and default encodings. The UTF-8 attempt is always strict.
    :returns: The page as a Unicode string.
    :raises JustextError: If neither UTF-8 nor `default_encoding`
        is able to decode the data.

    NOTE(review): with a strict `errors` handler a ``UnicodeDecodeError``
    from the explicit or declared encoding still propagates unchanged --
    confirm whether callers rely on that.
    """
    if isinstance(html, unicode):
        return html

    if encoding:
        return html.decode(encoding, errors)

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        try:
            declared_encoding = match.group(1).decode("ASCII")
        except UnicodeDecodeError:
            # A charset token with non-ASCII bytes can't be turned into
            # a codec name here; handle it like any other unknown
            # encoding instead of crashing.
            declared_encoding = None

        if declared_encoding is not None:
            # proceed unknown encoding as if it wasn't found at all
            with ignored(LookupError):
                return html.decode(declared_encoding, errors)

    # unknown encoding
    try:
        # try UTF-8 first
        return html.decode("utf8")
    except UnicodeDecodeError:
        # try lucky with default encoding
        try:
            return html.decode(default_encoding, errors)
        except UnicodeDecodeError as e:
            raise JustextError("Unable to decode the HTML to Unicode: " + unicode(e))