Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_fake_encoding_in_meta(self):
    """Strict decoding must fail when the meta tag names a bogus charset.

    The markup declares a charset for which no codec exists, while the
    payload itself is ISO-8859-2 bytes. With ``errors='strict'`` the
    test expects ``decode_html`` to raise ``JustextError``.
    """
    # The meta declaration is the point of this test: without it the
    # case is indistinguishable from the "unknown encoding" test.
    html = '<meta charset="iso-fake-29" content="text/html"/> ľščťžäňôě'
    with pytest.raises(JustextError):
        decode_html(html.encode("iso-8859-2"), errors='strict')
def test_unknown_encoding_in_strict_mode(self):
    """Strict decoding must fail for bytes with no declared encoding.

    The input carries no charset declaration and is encoded as
    ISO-8859-2; with ``errors='strict'`` the test expects
    ``decode_html`` to raise ``JustextError``.
    """
    html = 'ľščťžäňôě'
    with pytest.raises(JustextError):
        decode_html(html.encode("iso-8859-2"), errors='strict')
# Build an ``article`` dict for a feed entry from the page behind ``entry.link``.
#
# ``entry`` must expose ``.link`` and ``.title`` (both are read below).
# NOTE(review): this fragment ends inside the ``article`` dict literal -- the
# closing brace and whatever is done with ``article`` afterwards are not
# visible here.
def get_article(entry):
# Defaults; ``content`` and ``picture`` keep these values when the
# corresponding step below raises the exception it guards against.
# ``page`` and ``media`` are unconditionally reassigned further down.
page = ""
content = ""
picture = ""
media = ""
# get_html / guess_language are helpers defined outside this fragment.
page = get_html(entry.link)
language = guess_language(page)
try:
content = remove_boilerplate(page, language=language)
except justext.core.JustextError:
# Best effort: on a jusText failure ``content`` stays "".
pass
try:
picture = find_picture(page)
except requests.exceptions.Timeout:
# Best effort: on a requests timeout ``picture`` stays "".
pass
# These two calls are not guarded; any exception they raise propagates.
media = find_media(page)
keywords = find_keywords(entry.title)
article = {"link": entry.link,
"title": entry.title,
# ``time`` is resolved outside this fragment -- presumably time.time; TODO confirm.
"release": time(),
"content": content,
"media": media,
"image": picture,
"keywords": keywords,
# HTML tag names grouped under the "paragraph" label.
# NOTE(review): presumably the tags that start a new paragraph when the DOM
# is segmented -- the code consuming this list is not visible in this chunk.
PARAGRAPH_TAGS = [
    'body', 'blockquote', 'caption', 'center', 'col', 'colgroup', 'dd',
    'div', 'dl', 'dt', 'fieldset', 'form', 'legend', 'optgroup', 'option',
    'p', 'pre', 'table', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr',
    'ul', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
]

# Defaults for the ``default_encoding`` and ``errors`` parameters of
# ``html_to_dom``.
DEFAULT_ENCODING = 'utf8'
DEFAULT_ENC_ERRORS = 'replace'

# Captures the charset declared inside a <meta ...> tag, e.g. both
#   <meta charset="utf-8">
#   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-2">
# Group 1 is the charset name. It is a bytes pattern, so it is applied to
# undecoded markup. The leading ``<meta[^>]+`` anchors the match to a meta
# tag; without it the pattern would demand a literal ``]`` before
# ``charset=`` and never match real declarations.
CHARSET_META_TAG_PATTERN = re.compile(br"""<meta[^>]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE)
class JustextError(Exception):
    """Base class for jusText exceptions."""
class JustextInvalidOptions(JustextError):
    """jusText error type for invalid options.

    Subclasses ``JustextError``, so handlers for the base class also
    catch it. The places that raise it are not visible in this chunk.
    """
def html_to_dom(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
"""Converts HTML to DOM."""
if isinstance(html, unicode):
decoded_html = html
# encode HTML for case it's XML with encoding declaration
forced_encoding = encoding if encoding else default_encoding
html = html.encode(forced_encoding, errors)
else:
decoded_html = decode_html(html, default_encoding, encoding, errors)
try:
dom = lxml.html.fromstring(decoded_html, parser=lxml.html.HTMLParser())
except ValueError: