Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def name2cp(k):
    """Return the integer code point for the HTML entity name ``k``.

    ``apos`` is answered directly. Every other name is looked up in
    ``htmlentitydefs``; an unknown name raises ``KeyError`` from that lookup.
    """
    if k == 'apos':
        return ord("'")

    if not hasattr(htmlentitydefs, "name2codepoint"):
        # Fallback for interpreters older than Python 2.3, where only the
        # ``entitydefs`` table is available.
        definition = htmlentitydefs.entitydefs[k]
        is_charref = definition.startswith("&#") and definition.endswith(";")
        if is_charref:
            # Entities outside latin-1 are stored as "&#NNN;" references.
            return int(definition[2:-1])
        decoded = codecs.latin_1_decode(definition)[0]
        return ord(decoded)

    return htmlentitydefs.name2codepoint[k]
def handle_entityref(self, name):
    """Append the character for the named entity ``name`` to ``self.result``.

    :param name: entity name without the surrounding ``&`` and ``;``.

    A name missing from ``name2codepoint`` is appended as literal
    ``&name;`` text instead of raising ``KeyError`` out of the parser
    callback, so one unknown entity no longer aborts the whole parse.
    """
    try:
        codepoint = name2codepoint[name]
    except KeyError:
        # Unknown entity: keep the reference visible rather than crash.
        self.result.append('&%s;' % name)
    else:
        self.result.append(chr(codepoint))
:returns: cleaned plain text
:rtype: :class:`str`
.. versionadded:: 0.4.0
The ``base_uri`` parameter.
"""
parser = HtmlSanitizer(base_uri)
parser.feed(html)
return ''.join(parser.fed)
class MarkupTagCleaner(HTMLParser.HTMLParser):
    """Internal HTML parser backing the :func:`clean_html()` function.

    Text chunks seen while parsing are collected, in order, in the ``fed``
    list; tags themselves are not recorded.
    """

    # Entity name -> code point table consulted by handle_entityref().
    entity_map = htmlentitydefs.name2codepoint

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Record a chunk of character data."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Record the character for a known named entity; skip unknown ones."""
        try:
            char = unichr(self.entity_map[name])
        except KeyError:
            # Entity name not in the table: nothing is recorded.
            return
        self.fed.append(char)
def _decode_entity(self, match):
    """Return the replacement text for one matched entity reference.

    :param match: regex match whose group 1 is the entity body, in one of
        the forms ``#x41`` (hex), ``#65`` (decimal) or ``amp`` (named), and
        whose group 0 is the full matched text.
    :returns: the decoded character, or the full matched text unchanged
        when the reference cannot be decoded (unknown name, malformed
        number, code point out of range).
    """
    from html.entities import name2codepoint

    what = match.group(1)
    try:
        if what[:2] in ('#x', '#X'):
            return chr(int(what[2:], 16))
        if what.startswith('#'):
            return chr(int(what[1:]))
    except (ValueError, OverflowError):
        # Malformed digits or a code point chr() rejects: leave text as-is.
        return match.group(0)

    codepoint = name2codepoint.get(what)
    if codepoint is None:
        # Previously the fallback string was passed to chr(), which raised
        # TypeError; an unknown name now passes through unchanged.
        return match.group(0)
    return chr(codepoint)
def substitute(match):
    """Return the replacement text for one matched entity reference.

    :param match: regex match where group 1 is ``"#"`` for a numeric
        reference and group 2 is the decimal number or the entity name.
    :returns: the decoded character, or the matched text unchanged when
        the name has no (non-zero) entry in ``n2cp``.

    ``n2cp`` is looked up from the enclosing scope; it is used as a
    mapping from entity name to code point.
    """
    # Pick the character constructor once. Only the missing-name case is
    # caught, instead of the former bare ``except:`` around every call.
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    ent = match.group(2)
    if match.group(1) == "#":
        return to_char(int(ent))

    cp = n2cp.get(ent)
    if cp:
        return to_char(cp)
    return match.group()
>>> html.p(class_='foo', *[html.a('foo', href='foo.html'), ' ',
... html.a('bar', href='bar.html')])
u'<p class="foo"><a href="foo.html">foo</a> <a href="bar.html">bar</a></p>'
This class works around some browser limitations and can not be used for
arbitrary SGML/XML generation. For that purpose lxml and similar
libraries exist.
Calling the builder escapes the string passed:
>>> html.p(html("<foo>"))
u'<p>&lt;foo&gt;</p>'
"""
_entity_re = re.compile(r"&([^;]+);")
_entities = name2codepoint.copy()
_entities["apos"] = 39
_empty_elements = {
"area",
"base",
"basefont",
"br",
"col",
"command",
"embed",
"frame",
"hr",
"img",
"input",
"keygen",
"isindex",
"link",
def htmlentitydecode(s):
    """Decode HTML character references in ``s`` and return the result.

    Handles named entities (such as ``&eacute;``), decimal references
    (such as ``&#233;``) and hexadecimal references (such as ``&#xe9;``).

    All three forms are decoded in ONE pass over the input. The earlier
    implementation ran three successive substitutions, so text produced
    by the first pass was decoded again by the next ones (``&amp;#233;``
    wrongly became ``é`` instead of ``&#233;``).

    References that cannot be decoded (unknown names, malformed digits,
    code points outside the valid range) are left in the text unchanged
    rather than raising.
    """
    def _replace(m):
        hex_digits, dec_digits, name = m.groups()
        try:
            if hex_digits is not None:
                return chr(int(hex_digits, 16))
            if dec_digits is not None:
                return chr(int(dec_digits))
        except (ValueError, OverflowError):
            # Code point rejected by chr(): keep the reference as written.
            return m.group(0)
        codepoint = html.entities.name2codepoint.get(name)
        if codepoint is None:
            return m.group(0)
        return chr(codepoint)

    return re.sub(
        r'&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|([A-Za-z][A-Za-z0-9]*));',
        _replace,
        s,
    )
def entity2char(m):
    """Translate a matched named entity (group 1) into its character.

    A name missing from ``html.entities.name2codepoint`` yields a single
    space.
    """
    name = m.group(1)
    codepoints = html.entities.name2codepoint
    return chr(codepoints[name]) if name in codepoints else " "
t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)