Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_select(self):
# Sets up helpers that translate CSS selectors to XPath with cssselect and
# evaluate them against the document parsed from HTML_IDS.
# NOTE(review): indentation is missing in this excerpt and the body seems to
# stop right after the pcss helper; check the upstream test before editing.
document = etree.fromstring(HTML_IDS)
# Maps every element to its position in iteration order; the bound
# __getitem__ is used below as the sort key for matched elements.
# NOTE(review): getiterator() is the older spelling of iter() - confirm it is
# still available in the lxml/ElementTree version being targeted.
sort_key = dict(
(el, count) for count, el in enumerate(document.getiterator())
).__getitem__
css_to_xpath = GenericTranslator().css_to_xpath
html_css_to_xpath = HTMLTranslator().css_to_xpath
# Returns the 'id' attribute ('nil' when absent) of each element matched by
# `selector`, ordered by sort_key.
# NOTE(review): presumably the three statements after `if html_only:` all
# belong to that branch (generic translation must match nothing, then the
# HTML translation is evaluated instead) - confirm against upstream.
def select_ids(selector, html_only):
xpath = css_to_xpath(selector)
items = document.xpath(xpath)
if html_only:
assert items == []
xpath = html_css_to_xpath(selector)
items = document.xpath(xpath)
items.sort(key=sort_key)
return [element.get('id', 'nil') for element in items]
# Evaluates `main` and asserts that every selector in `selectors` yields the
# same id list. Accepts the keyword argument html_only (default False).
def pcss(main, *selectors, **kwargs):
html_only = kwargs.pop('html_only', False)
result = select_ids(main, html_only)
for selector in selectors:
assert select_ids(selector, html_only) == result
self.query = query
self.html = html
self.dom = None
self.search_results = {}
self.num_results_for_query = ''
self.num_results = 0
self.effective_query = ''
self.page_number = -1
self.no_results = False
# to be set by the implementing sub classes
self.search_engine = ''
# short alias because we use it so extensively
self.css_to_xpath = HTMLTranslator().css_to_xpath
if self.html:
self.parse()
from cssselect import HTMLTranslator
class XpathTranslator(HTMLTranslator):
    """HTML translator whose state-dependent pseudo-classes are no-ops.

    Every pseudo-class hook assigned below is bound to
    ``pseudo_matches_if_exists``, so an XPath expression carrying one of
    those pseudo-classes comes back exactly as it went in.
    """

    def pseudo_matches_if_exists(self, xpath):
        """Return *xpath* unchanged."""
        return xpath

    # Link-related pseudo-classes.
    xpath_link_pseudo = pseudo_matches_if_exists
    xpath_visited_pseudo = pseudo_matches_if_exists

    # User-interaction pseudo-classes.
    xpath_hover_pseudo = pseudo_matches_if_exists
    xpath_active_pseudo = pseudo_matches_if_exists
    xpath_focus_pseudo = pseudo_matches_if_exists

    # Remaining pseudo-classes handled the same way.
    xpath_target_pseudo = pseudo_matches_if_exists
    xpath_enabled_pseudo = pseudo_matches_if_exists
def CSSSelect(expr):
    """Compile the CSS selector *expr* into an ``lxml.etree.XPath`` object.

    The selector is translated to an XPath expression by a freshly created
    cssselect ``HTMLTranslator``; the resulting string is wrapped in
    ``XPath`` and returned.
    """
    # Imported lazily so the dependencies are only needed when this is called.
    from cssselect import HTMLTranslator
    from lxml.etree import XPath

    translator = HTMLTranslator()
    xpath_expression = translator.css_to_xpath(expr)
    return XPath(xpath_expression)
raise ExpressionError(
"Expected a single string or ident for ::attr(), got %r"
% function.arguments)
return ScrapyXPathExpr.from_xpath(xpath,
attribute=function.arguments[0].value)
def xpath_text_simple_pseudo_element(self, xpath):
    """Handle the ``::text`` pseudo-element.

    Builds a ``ScrapyXPathExpr`` from *xpath* with ``textnode=True`` and
    returns it.
    """
    text_expression = ScrapyXPathExpr.from_xpath(xpath, textnode=True)
    return text_expression
class ScrapyGenericTranslator(TranslatorMixin, GenericTranslator):
    """``GenericTranslator`` combined with ``TranslatorMixin``; defines nothing of its own."""
class ScrapyHTMLTranslator(TranslatorMixin, HTMLTranslator):
    """``HTMLTranslator`` combined with ``TranslatorMixin``; defines nothing of its own."""
class CSSSelectorMixin(object):
    """Mixin that lets a selector class accept CSS expressions.

    ``select`` converts the CSS expression to XPath through
    ``self.translator`` and passes the result to the next ``select`` in the
    method resolution order. The class this is mixed into must therefore
    supply both a ``translator`` attribute and a ``select`` method.
    """

    def _css2xpath(self, css):
        """Return the XPath produced by ``self.translator.css_to_xpath(css)``."""
        translate = self.translator.css_to_xpath
        return translate(css)

    def select(self, css):
        """Translate *css* to XPath and delegate to the parent ``select``."""
        # Translate first, then resolve the parent implementation.
        translated = self._css2xpath(css)
        parent = super(CSSSelectorMixin, self)
        return parent.select(translated)
class CSSSelector(CSSSelectorMixin, XPathSelector):
    """``XPathSelector`` that takes CSS expressions via ``CSSSelectorMixin``."""

    # A single translator instance, created at class-definition time and
    # shared through the class attribute.
    translator = ScrapyHTMLTranslator()
def find_page_breaks(self, item):
# On first use (self.page_break_selectors is None) builds the set of
# page-break selectors from the stylesheets in the book's manifest.
# NOTE(review): indentation is missing in this excerpt and the body appears
# truncated - `item` and `after` are never used in the visible part.
if self.page_break_selectors is None:
from calibre.ebooks.oeb.stylizer import fix_namespace
css_to_xpath = HTMLTranslator().css_to_xpath
self.page_break_selectors = set([])
# Data of every manifest entry whose media type is listed in OEB_STYLES.
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES]
for rule in rules(stylesheets):
# Stripped, lower-cased cssText of each property; falls back to '' when the
# object returned by getPropertyCSSValue has no cssText attribute.
before = getattr(rule.style.getPropertyCSSValue(
'page-break-before'), 'cssText', '').strip().lower()
after = getattr(rule.style.getPropertyCSSValue(
'page-break-after'), 'cssText', '').strip().lower()
try:
# A non-empty value other than avoid/auto/inherit registers the rule's
# selector, translated to XPath and compiled, paired with True.
if before and before not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
True))
# Optionally drop the property from the rule.
if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-before')
except:
# NOTE(review): this bare except silently discards every error raised in
# the try body (including translation/compilation failures) - consider
# narrowing it.
pass
def mutate_selector_del(selector, method, expression):
    """Remove every node matched by *expression* from ``selector.root``.

    Under the covers, Selectors contain an lxml.etree.Element document
    root, which is not exposed by the Selector interface. This is mutable
    using the .remove method on parts of the selector.root document tree.
    Unfortunately, there is no native content removal interface in scrapy.

    As this is not using a published interface for Selector, it must be
    considered risky. In particular, it is feasible (though not likely) that
    scrapy could change its selector implementation to use a different
    HTML/XML parsing library, at which point this would fail.

    Args:
        selector: object whose ``root`` attribute provides an ``xpath``
            method returning nodes that support ``getparent()``.
        method: ``'xpath'`` to use *expression* as is, or ``'css'`` to
            translate it with ``HTMLTranslator().css_to_xpath`` first.
        expression: XPath or CSS expression selecting the nodes to delete.

    Returns:
        None. Any failure (unsupported *method*, a match on the root node,
        translation or evaluation errors) is logged with ``logger.error``
        and not propagated; nodes removed before the failure stay removed.
    """
    try:
        if method == 'xpath':
            xpath_expr = expression
        elif method == 'css':
            xpath_expr = HTMLTranslator().css_to_xpath(expression)
        else:
            # Carry a reason so the log entry below is not left blank.
            raise NotImplementedError(
                "unsupported method {!r}; expected 'xpath' or 'css'".format(
                    method))

        for node in selector.root.xpath(xpath_expr):
            parent = node.getparent()
            if parent is None:
                # Without a parent there is nothing to detach the node from;
                # report that explicitly instead of failing on None.remove.
                raise ValueError(
                    'matched node has no parent and cannot be removed')
            # NOTE(review): with lxml, remove() presumably also discards the
            # node's tail text - confirm that this is intended.
            parent.remove(node)
    except Exception as e:
        logger.error('mutate_selector_del({}, {}, {},) failed: {}'.format(
            selector,
            method,
            expression,
            e))
raise ExpressionError(
"Expected a single string or ident for ::attr(), got %r"
% function.arguments)
return ScrapyXPathExpr.from_xpath(xpath,
attribute=function.arguments[0].value)
def xpath_text_simple_pseudo_element(self, xpath):
    """Handle the ``::text`` pseudo-element.

    Builds a ``ScrapyXPathExpr`` from *xpath* with ``textnode=True`` and
    returns it.
    """
    text_expression = ScrapyXPathExpr.from_xpath(xpath, textnode=True)
    return text_expression
class ScrapyGenericTranslator(TranslatorMixin, GenericTranslator):
    """``GenericTranslator`` combined with ``TranslatorMixin``; defines nothing of its own."""
class ScrapyHTMLTranslator(TranslatorMixin, HTMLTranslator):
    """``HTMLTranslator`` combined with ``TranslatorMixin``; defines nothing of its own."""
def process_query(self, query):
    """Translate the CSS *query* to XPath and defer to the parent class.

    The translation uses a new ``HTMLTranslator``; the XPath string is then
    passed to the ``process_query`` implementation that follows
    ``CssSelector`` in the method resolution order, whose result is returned.
    """
    translator = HTMLTranslator()
    translated = translator.css_to_xpath(query)
    parent = super(CssSelector, self)
    return parent.process_query(translated)