Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def extracthtmlurls(mesg):
    """Extract URLs, together with surrounding context, from an HTML message.

    The message is run through an HTMLChunker and the resulting chunks are
    handed to extract_with_context. Similar to extracturls.
    """
    parser = HTMLChunker()
    parser.feed(mesg)
    parser.close()

    def somechunkisurl(chunks):
        # True as soon as any chunk in the group carries a URL.
        return any(piece.url is not None for piece in chunks)

    # The two trailing 1s were labelled above_context / below_context in the
    # original notes.
    return extract_with_context(parser.rval, somechunkisurl, 1, 1)
def handle_entityref(self, name):
    """Forward the text for a named entity reference to handle_data.

    Names present in HTMLChunker.entities are replaced by their mapped
    value; any other name is passed through as the literal '&name;' text.
    """
    table = HTMLChunker.entities
    # An entity that falls through to the literal form here is one that
    # should be added to the entities table.
    text = table[name] if name in table else '&%s;' % name
    self.handle_data(text)
def handle_starttag(self, tag, attrs):
    """Update the chunker state for an opening HTML tag.

    Dispatches on ``tag``:

    * ``a``: push the ``href`` attribute (as returned by ``findattr``)
      onto ``anchor_stack``.
    * ``ul`` / ``ol``: push ``(tag, 1)`` onto ``list_stack`` and call
      ``end_para()``.
    * tags listed in ``HTMLChunker.tag_styles``: push the current style
      set extended with that tag's style onto ``style_stack``.
    * header tags (per ``isheadertag``): push the current style set
      extended with ``'bold'``.
    * ``p`` / ``br``: call ``end_para()``.
    * ``img``: see the review note in that branch.
    """
    if tag == 'a':
        self.anchor_stack.append(self.findattr(attrs, 'href'))
    elif tag in ('ul', 'ol'):
        # presumably the 1 is the starting item number -- TODO confirm
        self.list_stack.append((tag, 1))
        self.end_para()
    elif tag in HTMLChunker.tag_styles:
        # Styles nest: the new entry is the enclosing style set plus this
        # tag's style; the enclosing set itself is not modified.
        self.style_stack.append(self.style_stack[-1] |
                                set([HTMLChunker.tag_styles[tag]]))
    elif isheadertag(tag):
        self.style_stack.append(self.style_stack[-1] | set(['bold']))
    elif tag in ('p', 'br'):
        self.end_para()
    elif tag == 'img':
        # Since we expect HTML *email*, image links
        # should be external (well, probably?)
        alt = self.findattr(attrs, 'alt')
        if alt is None:
            alt = '[IMG]'
        src = self.findattr(attrs, 'src')
        # Only absolute http(s) sources are kept; anything else is dropped.
        if src is not None and not src.startswith(('http://', 'https://')):
            src = None
        # NOTE(review): alt and src are computed but never used in the
        # code visible here -- this branch looks truncated. Confirm
        # against the complete file before relying on it.
def handle_charref(self, name):
    """Forward the text for a numeric character reference to handle_data.

    ``name`` is the body of the reference (the part between ``&#`` and
    ``;``): decimal digits, or ``x``/``X`` followed by hexadecimal digits.

    * Code points below 128 are emitted as the character itself.
    * Code points found in ``HTMLChunker.extrachars`` are emitted as their
      mapped replacement.
    * Everything else, including malformed references that cannot be
      parsed as a number, is passed through as the literal ``&#name;``
      text instead of raising.
    """
    literal = '&#%s;' % name
    try:
        # HTML allows both "&#x41;" and "&#X41;" for hexadecimal. Slicing
        # (rather than indexing) also keeps an empty name from raising.
        if name[:1] in ('x', 'X'):
            char = int(name[1:], 16)
        else:
            char = int(name)
    except ValueError:
        self.handle_data(literal)
        return

    # The lower bound keeps a stray negative value away from chr().
    if 0 <= char < 128:
        text = chr(char)
    elif char in HTMLChunker.extrachars:
        text = HTMLChunker.extrachars[char]
    else:
        text = literal
    self.handle_data(text)