# Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
# Drop the omit_start tags from the allowed map before building exclusions.
# NOTE(review): omit(), formctrl, block, pre_exclusion and setify() are
# defined elsewhere in this module — confirm their semantics there.
allowed = omit(allowed, omit_start)
# HTML DTD-style content-model exclusions: each key tag must not contain
# any of the listed tags anywhere inside its content.
excluded = {
    "a": ["a"],
    "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
    "dir": block,
    "form": ["form"],
    "label": ["label"],
    "menu": block,
    "pre": pre_exclusion
}
# Presumably converts the list values to sets for fast membership tests
# — verify against setify()'s definition.
excluded = setify(excluded)
class HTMLParser(base.HTMLParser):
def __init__(self, entities=None):
    """Initialise parser state.

    Sets up empty tag/data/exclusion stacks and an optional entity
    substitution mapping (defaults to an empty dict).
    """
    base.HTMLParser.__init__(self)
    self.excluded = frozenset()
    self.excluded_stack = []
    self.tag_stack = []
    self.data_stack = []
    self.data = []
    self.decls = []
    # A truthy mapping wins; otherwise start with no entity substitutions.
    self.entities = entities or {}
def top(self):
if self.tag_stack == []:
# SECURITY WARNING: this globally disables TLS certificate verification
# for every HTTPS connection made through ssl's default context in this
# process, exposing users to man-in-the-middle attacks. Prefer fixing the
# CA bundle / certificate configuration instead of this monkeypatch.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
class limetorrents(object):
    """qBittorrent search-plugin descriptor for limetorrents.info."""
    # Base URL of the tracker's web interface.
    url = "https://www.limetorrents.info"
    # Human-readable engine name shown in qBittorrent's search UI.
    name = "LimeTorrents"
    # Maps qBittorrent's generic category keys to the site's own slugs.
    supported_categories = {'all': 'all',
                            'anime': 'anime',
                            'software': 'applications',
                            'games': 'games',
                            'movies': 'movies',
                            'music': 'music',
                            'tv': 'tv'}
class MyHtmlParser(HTMLParser):
    """ Sub-class for parsing results """

    def error(self, message):
        # Override of the (abstract in older Pythons) ParserBase.error();
        # parse errors are deliberately ignored.
        pass

    # Tag and attribute names referenced while walking the result table.
    A, TD, TR, HREF = ('a', 'td', 'tr', 'href')
def __init__(self, url):
HTMLParser.__init__(self)
self.url = url
self.current_item = {} # dict for found item
self.item_name = None # key's name in current_item dict
self.page_empty = 22000
self.inside_tr = False
self.findTable = False
self.parser_class = {"tdnormal": "size", # class
def handle_endtag(self, tag):
    """Close-tag hook: tracks nesting of the results <div> and, when a
    result <li> closes, appends a finished profile record.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    source — verify the nesting against the original file.
    """
    if tag == 'div' and self.inside_results:
        #We closed one of the tags
        if (self.internal_div > 0):
            self.internal_div -= 1
        if (self.internal_div == 0):
            # Left the outermost results container.
            self.inside_results = False
    if self.inside_results:
        if (tag == "li"):
            # NOTE(review): assumes self.name / web_link / death_date /
            # comments were populated by the start-tag and data handlers
            # before the </li> arrives — confirm upstream.
            name, surname, _ = get_name_surname_from_complete_name(self.name, convention="spanish_surname")
            prof_record = gen_profile(name, surname)
            prof_record.setWebReference(self.web_link)
            prof_record.setCheckedDate("death", self.death_date.year, self.death_date.month,self.death_date.day,"EXACT")
            prof_record.setComments(self.comments)
            self.records.append(prof_record)
class RememoryPersonParser(HTMLParser):
'''
This function will parser an specific individual to extract specific data useful for comparison
'''
def __init__(self):
'''
As input we intoduce
profile: a generic profile to be updated
url: the specific url address
'''
HTMLParser.__init__(self)
self.location = None
self.age = None
self.located = False
def handle_starttag(self, tag, attrs):
if tag == "br":
self.located = True
def __init__(self, parent, child):
    """Set up a parser that collects matches of *child* tags nested
    under *parent* tags.
    """
    html.parser.HTMLParser.__init__(self)
    # Accumulated matches and the match currently being built.
    self.results = []
    self.current = None
    self.parenttag = parent
    self.childtag = child
if self._start_level is not None:
if end:
if level == self._start_level:
self._start_level = None
spaces = ''.join([' ' for i in range(level)])
attr_string = None
if attrs:
attr_string = ''
for (k, v) in attrs:
attr_string += ' ' + str(k) + '=\"' + str(v) + '\"'
print(('%s<%s%s%s>%s'%(spaces, ('/' if end else ''), tag, (attr_string if attr_string else ''),
(' ' + msg if msg else ''))))
class HtmlParser(HTMLParser):
def __init__(self, skip_tags=None, ddump=None):
    """Initialise the tree-building parser.

    :param skip_tags: iterable of tag names whose subtrees are skipped
        (defaults to no skipping). Previously this used a mutable default
        argument (``[]``), which is shared across all instances and can
        leak state between parsers; ``None`` sentinel fixes that while
        keeping call sites unchanged.
    :param ddump: optional debug-dump callback/target.
    """
    self._root = None
    self._stack = []
    self._skip_tags = skip_tags if skip_tags is not None else []
    # (skipping-active flag, tag that started the skip)
    self._skip = False, None
    self._ddump = ddump
    if is_py3():
        # convert_charrefs keeps entity handling consistent on Python 3.
        HTMLParser.__init__(self, convert_charrefs=True)
    else:
        HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
if self._skip[0] == True:
return
from nom.md2html import compile_markdown
from port.fs import FileManager
from jinja2 import Environment, FileSystemLoader
# YAML-style front matter delimited by '---' lines at the top of the file.
meta_re = re.compile(r'^---\n(.*?)\n---', re.DOTALL)
# First ATX heading ('# Title') — captures the title text.
title_re = re.compile(r'^#\s?([^#\n]+)')
# Markdown image syntax; captures the (possibly backtick-wrapped) path.
md_img_re = re.compile(r'!\[.*?\]\(`?([^`\(\)]+)`?\)')
class Bunch():
    """Lightweight attribute container: every keyword argument becomes
    an instance attribute of the same name.
    """

    def __init__(self, **data):
        # Bulk-assign all keywords in one step.
        self.__dict__.update(data)
class HTMLCleaner(HTMLParser):
    """HTMLParser subclass that keeps only text nodes.

    feed() HTML into it, then call get_data() for the concatenated
    plain text with all markup removed.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        # Text fragments collected during parsing, joined on demand.
        self.fed = []

    def get_data(self):
        """Return all text seen so far as one string."""
        return ''.join(self.fed)

    def handle_data(self, d):
        # Text between tags is the only content we keep.
        self.fed.append(d)
def remove_html(html):
    """Strip all markup from *html* and return the plain-text content."""
    stripper = HTMLCleaner()
    stripper.feed(html)
    return stripper.get_data()
from html.parser import HTMLParser
class NodeParser(HTMLParser):
    """Capture the ``value`` attribute of the first <option> tag seen."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.stop_flag = False
        self.node_id = ""

    def handle_starttag(self, tag, attrs):
        # Guard clause: once we have a value, or for any other tag, bail.
        if self.stop_flag or tag != "option":
            return
        self.node_id = dict(attrs)["value"]
        self.stop_flag = True

    # TODO: Add error response here.
    def error(self, message):
        pass
def __init__(self, trace=False):
    """Initialise the parser; *trace* enables event logging.

    Delegates the initial state reset to restart().
    """
    html.HTMLParser.__init__(self)
    self.result_list = []
    self.trace = trace
    self.restart()
from io import StringIO
from html.parser import HTMLParser
import base64
class HTMLProcessor(HTMLParser):
# The HTMLProcessor replaces the src attribute in <img> tags with the base64 equivalent
# The image content comes from the zip_file (specified with set_src_source())
# It also strips
print("Here's what %s did with the markup:" % parser)
print(soup.prettify())
print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: the markup to parse, as ``str`` or ``bytes``.
    :param html: parse as HTML (True) or XML (False).
    :param kwargs: passed straight through to ``etree.iterparse``.
    """
    from io import BytesIO, StringIO
    from lxml import etree
    # iterparse needs a file-like object; choose the buffer type that
    # matches the input so bytes input no longer crashes StringIO.
    buffer = BytesIO(data) if isinstance(data, bytes) else StringIO(data)
    for event, element in etree.iterparse(buffer, html=html, **kwargs):
        print("%s, %4s, %s" % (event, element.tag, element.text))
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single choke point for output, easy to redirect in subclasses.
        print(s)

    def handle_starttag(self, name, attrs):
        message = "%s START" % name
        self._p(message)

    def handle_endtag(self, name):
        message = "%s END" % name
        self._p(message)

    def handle_data(self, data):
        message = "%s DATA" % data
        self._p(message)

    def handle_charref(self, name):
        message = "%s CHARREF" % name
        self._p(message)