How to use the lxml.etree.HTMLParser class from the lxml library

To help you get started, we’ve selected a few lxml examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github datashaman / wifidog-auth-flask / tests / __init__.py View on Github external
def get_html(self, response):
        """Parse an HTTP response body as HTML.

        Returns an lxml element tree built with the forgiving HTMLParser,
        so test assertions can use XPath against the rendered page.
        """
        raw = response.get_data()
        lenient = etree.HTMLParser()
        return etree.parse(six.StringIO(str(raw)), lenient)
github datashaman / wifidog-auth-flask / tests / test_unit.py View on Github external
def get_html(self, response):
        """Parse an HTTP response body as HTML.

        Returns an lxml element tree built with the forgiving HTMLParser,
        so test assertions can use XPath against the rendered page.
        """
        data = response.get_data()
        parser = etree.HTMLParser()
        # Reuse the body fetched above instead of calling get_data() a
        # second time (the original fetched it twice and discarded `data`).
        return etree.parse(StringIO(data), parser)
github bowenpay / wechat-spider / wechat / downloaders.py View on Github external
def download_wechat_keyword_topics(self, word, process_topic):
        """ On the keyword results page, open each listed article one by one and crawl it. """
        # Grab the rendered DOM from the Selenium-driven browser rather than
        # the raw HTTP response, so JS-generated content is included.
        browser = self.browser
        js = """ return document.documentElement.innerHTML; """
        body = browser.execute_script(js)

        # Lenient HTML parsing of the rendered page.
        htmlparser = etree.HTMLParser()
        tree = etree.parse(StringIO(body), htmlparser)

        # Article titles: the search engine wraps the matched keyword in
        # 'red_beg' / 'red_end' markers, which are stripped here.
        elems = [stringify_children(item).replace('red_beg', '').replace('red_end', '') for item in tree.xpath("//div[@class='txt-box']/h3/a")]
        hrefs = tree.xpath("//div[@class='txt-box']/h3/a/@href")
        #avatars = tree.xpath("//div[@class='img-box']/a/img/@src")
        #elems_abstracts = tree.xpath("//div[@class='txt-box']/p")
        #abstracts = [item.text.strip() if item.text else '' for item in elems_abstracts]
        # Avatars/abstracts are not scraped here; placeholders keep the lists
        # aligned with `elems` by index.
        avatars = [''] * len(elems)
        abstracts = [''] * len(elems)
        links = []
        # NOTE(review): this snippet is truncated below — the loop body and the
        # use of `links`, `hrefs`, and `process_topic` continue past this view.
        for idx, item in enumerate(elems):
            title = item
            print title
            if not title:
                continue
            # Deduplication key derived from keyword + title.
            uniqueid = get_uniqueid('%s:%s' % (word, title))
github fangpenlin / loso / loso / crawlers / hinet_news.py View on Github external
def parseHtml(html):
    """Parse a UTF-8 encoded HTML string and return an lxml element tree."""
    lenient = etree.HTMLParser(encoding='utf8')
    return etree.parse(StringIO.StringIO(html), lenient)
github Marduke / CalimeplPacz / plugins / palmknihy / __init__.py View on Github external
query = self.create_query(title=title, authors=authors)
        # Guard clause: without enough metadata no search URL can be built.
        if not query:
            self.log('Insufficient metadata to construct query')
            return

        br = self.browser
        try:
            self.log('download page search %s'%query)
            raw = br.open(query, timeout=timeout).read().strip()
        except Exception as e:
            self.log.exception('Failed to make identify query: %r'%query)
            # Calibre identify convention: return the error as unicode text.
            return as_unicode(e)

        try:
            # Lenient parse of the (cleaned) search-results page.
            parser = etree.HTMLParser()
            clean = clean_ascii_chars(raw)
            feed = fromstring(clean, parser=parser)
#             if len(parser.error_log) > 0: #some errors while parsing
#                 self.log('while parsing page occus some errors:')
#                 self.log(parser.error_log)

            # How many result pages the search produced (project helper).
            more_pages = pages_count(feed)
            #more pages with search results
            que = Queue()
            if ident is not None:
                que.put([ident, title, authors])
            if len(more_pages) > 0:
                # NOTE(review): .group()[-1] keeps only the LAST digit of the
                # matched number — looks suspicious for counts >= 10; confirm
                # against the site's pager markup before changing.
                page_max = int(re.search("\d+", more_pages[0]).group()[-1])
            else:
                page_max = 1
github Marduke / CalimeplPacz / plugins / xtr / __init__.py View on Github external
formUrl = feed.xpath('//form[@id="form"]/@action')
            self.log('formUrl %s'%formUrl[0])
            
            # Absolute URL of the login form's action target.
            url = self.BASE_URL + formUrl[0]

            # Credentials come from the plugin's stored preferences.
            parameters = {
                "sendform":"1",
                "login_name":self.prefs['login'],
                "login_password":self.prefs['password']
            }
            data = urllib.urlencode(parameters)
            self.log(url)
            self.log(data)
            # POST the login form and parse the resulting page leniently.
            clean = clean_ascii_chars(br.open(url,data).read().strip())
            parser = etree.HTMLParser(recover=True)
            feed = fromstring(clean, parser=parser)
            self.log(clean)
            # Success heuristic: the login_name input disappears once logged in.
            return len(feed.xpath('//input[@id="login_name"]/@name')) == 0
        except Exception as e:
            # Best-effort login check: any failure is reported as "not logged in".
            self.log.exception(e)
            return False
github reagle / pandoc-wrappers / wiki-update.py View on Github external
def insert_todos(plan_fn, todos):
    """Replace the 'Ongoing-todos' div in *plan_fn* with *todos* and rewrite the file.

    plan_fn: path to an HTML file that contains a div with id="Ongoing-todos".
    todos:   lxml element that replaces that div in place.
    Raises IndexError if the div is not found.
    """
    info("insert_todos")
    html_parser = etree.HTMLParser(
        remove_comments=True, remove_blank_text=True
    )
    # Context manager closes the file handle promptly; the original leaked
    # the handle returned by the bare open() call.
    with open(plan_fn, "rb") as fh:
        doc = etree.parse(fh, html_parser)
    div = doc.xpath('//div[@id="Ongoing-todos"]')[0]
    parent = div.getparent()
    parent.replace(div, todos)
    # Serialize the modified tree back over the original file.
    doc.write(plan_fn)
github mementoweb / timegate / timegate / examples / webcite.py View on Github external
try:
            # Prime the WebCite session with the target URL, then fetch the
            # frame that lists the archived snapshots.
            req = urllib2.Request(wcurl, None, txheaders)
            fh = urllib2.urlopen(req)
            fh.close()

            req = urllib2.Request('http://webcitation.org/topframe.php')
            fh = urllib2.urlopen(req)
            data = fh.read()
            fh.close()
        except Exception as e:
            # NOTE(review): the caught exception `e` is discarded; consider
            # chaining it into HandlerError for debuggability.
            raise HandlerError('Cannot request page', 404)

        changes = []

        try:
            # Lenient HTML parse of the snapshot-list frame.
            parser = etree.HTMLParser()
            dom = etree.parse(StringIO.StringIO(data), parser)
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt /
            # SystemExit — narrowing to Exception would be safer.
            raise HandlerError('Cannot parse HTML')

        # Each <option> in the snapshot selector is one archived capture:
        # value = snapshot id, text = capture date.
        opts = dom.xpath('//select[@name="id"]/option')
        for o in opts:
            fid = o.attrib['value']
            date = o.text
            # Skip captures that WebCite marks as failed.
            if date.find('(failed)') > -1:
                continue

            changes.append(('http://webcitation.org/query?id=' + fid, date))

        return changes
github mollyproject / mollyproject / molly / molly / apps / external_media / sanitiser.py View on Github external
def sanitise_html(dirty_html, opener=None, device=None):
    """Run untrusted HTML through the sanitiser XSLT and return the clean markup.

    The fragment is wrapped in a <div> so lxml parses it as a single element;
    the wrapper is stripped again before returning.
    """
    wrapped = "<div>%s</div>" % dirty_html
    tree = etree.fromstring(wrapped, parser = etree.HTMLParser())
    tree = transform(tree, 'external_media/html_sanitiser.xslt')
    serialized = etree.tostring(tree, method='html')
    # Drop the wrapping tags: 5 == len('<div>'), 6 == len('</div>').
    return serialized[5:-6]
github mitodl / edxcut / edxcut / edxapi.py View on Github external
def list_courses(self):
        '''
        List courses available in Studio site
        '''
        self.ensure_studio_site()
        url = "%s/home/" % self.BASE
        ret = self.ses.get(url)
        parser = etree.HTMLParser()
        xml = etree.parse(StringIO(ret.content), parser).getroot()
        courses = []
        course_ids = []
        for course in xml.findall('.//li[@class="course-item"]'):
            cid = course.get("data-course-key")
            if self.verbose:
                print cid  # etree.tostring(course)
            courses.append(course)
            course_ids.append(cid)
        return {'xml': courses,
                'course_ids': course_ids,
                }