Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
/* All the other properties go here as keys */
}
schema_context: schema's context for current page"""
if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
raise ValueError("syntaxes must be a list with any or all (default) of"
"these values: {}".format(SYNTAXES))
if errors not in ['log', 'ignore', 'strict']:
raise ValueError('Invalid error command, valid values are either "log"'
', "ignore" or "strict"')
domparser = XmlDomHTMLParser(encoding=encoding)
tree = fromstring(htmlstring, parser=domparser)
processors = []
if 'microdata' in syntaxes:
processors.append(('microdata', MicrodataExtractor().extract_items))
if 'json-ld' in syntaxes:
processors.append(('json-ld', JsonLdExtractor().extract_items))
if 'opengraph' in syntaxes:
processors.append(('opengraph', OpenGraphExtractor().extract_items))
if 'microformat' in syntaxes:
processors.append(('microformat', MicroformatExtractor().extract_items))
if 'rdfa' in syntaxes:
processors.append(('rdfa', RDFaExtractor().extract_items))
output = {}
for label, extract in processors:
try:
output[label] = [obj for obj in extract(document=tree,
url=url,
html=htmlstring)]
except Exception:
if errors == 'log':
logger.exception("Failed to extract {} from {}".format(label, url))
if errors == 'ignore':
def _check_jsonld(self, body, expected):
    """Assert that JSON-LD extraction of ``body`` yields ``expected``.

    A fresh ``JsonLdExtractor`` is created, its ``extract`` method is run
    on ``body``, and the result is compared with ``expected`` through
    ``self.assertEqual``.
    """
    extracted = JsonLdExtractor().extract(body)
    self.assertEqual(extracted, expected)
if rdfa:
try:
self.rdfae = RDFaExtractor()
self.rdfadata = self.rdfae.extract(self.response.text,
url=self.response.url)
except JSONDecodeError:
pass
if microdata:
try:
self.mde = MicrodataExtractor()
self.mdedata = self.mde.extract(self.response.text)
except JSONDecodeError:
pass
if jsonld:
try:
self.jlde = JsonLdExtractor()
self.jldata = self.jlde.extract(self.response.text)
except (JSONDecodeError, TypeError):
self.jldata = []
finally:
# Sometimes we get this in the meta dict from RISJExtractJSONLD
self.jldata.extend(self.response.meta.get('json-ld', []))
'Failed to parse html, raises {}'.format(e))
return {}
if errors == 'strict':
raise
processors = []
if 'microdata' in syntaxes:
processors.append(
('microdata',
MicrodataExtractor(add_html_node=return_html_node).extract_items,
tree
))
if 'json-ld' in syntaxes:
processors.append(
('json-ld',
JsonLdExtractor().extract_items,
tree,
))
if 'opengraph' in syntaxes:
processors.append(
('opengraph',
OpenGraphExtractor().extract_items,
tree
))
if 'microformat' in syntaxes:
processors.append(
('microformat',
MicroformatExtractor().extract_items,
htmlstring
))
if 'rdfa' in syntaxes:
processors.append(