Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
{'@context': 'http://example.com',
'@type': 'example_type',
/* All the other properties as keys here */
}
schema_context: schema's context for current page"""
if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
raise ValueError("syntaxes must be a list with any or all (default) of"
"these values: {}".format(SYNTAXES))
if errors not in ['log', 'ignore', 'strict']:
raise ValueError('Invalid error command, valid values are either "log"'
', "ignore" or "strict"')
domparser = XmlDomHTMLParser(encoding=encoding)
tree = fromstring(htmlstring, parser=domparser)
processors = []
if 'microdata' in syntaxes:
processors.append(('microdata', MicrodataExtractor().extract_items))
if 'json-ld' in syntaxes:
processors.append(('json-ld', JsonLdExtractor().extract_items))
if 'opengraph' in syntaxes:
processors.append(('opengraph', OpenGraphExtractor().extract_items))
if 'microformat' in syntaxes:
processors.append(('microformat', MicroformatExtractor().extract_items))
if 'rdfa' in syntaxes:
processors.append(('rdfa', RDFaExtractor().extract_items))
output = {}
for label, extract in processors:
try:
output[label] = [obj for obj in extract(document=tree,
url=url,
html=htmlstring)]
except Exception:
if errors == 'log':
def test_join_none(self):
    # Run a default MicrodataExtractor over the schema.org
    # 'product-ref.html' fixture and require the result to equal the
    # decoded contents of 'product-ref.json'.
    html = get_testdata('schema.org', 'product-ref.html')
    reference_bytes = get_testdata('schema.org', 'product-ref.json')
    reference = json.loads(reference_bytes.decode('UTF-8'))
    extractor = MicrodataExtractor()
    self.assertEqual(extractor.extract(html), reference)
def test_w3c_5_2(self):
    # Extract from the w3c 'microdata.5.2.html' fixture with
    # nested=False and strict=True, then compare against the decoded
    # 'microdata.5.2.flat.json' fixture.
    page = get_testdata('w3c', 'microdata.5.2.html')
    fixture_json = get_testdata('w3c', 'microdata.5.2.flat.json')
    wanted = json.loads(fixture_json.decode('UTF-8'))
    extractor = MicrodataExtractor(nested=False, strict=True)
    found = extractor.extract(page)
    self.assertEqual(found, wanted)
def test_w3c_5_3(self):
    # Extract from the w3c 'microdata.5.3.html' fixture with strict=True
    # and compare against the decoded 'microdata.5.3.json' fixture.
    markup = get_testdata('w3c', 'microdata.5.3.html')
    json_text = get_testdata('w3c', 'microdata.5.3.json').decode('UTF-8')
    wanted = json.loads(json_text)
    result = MicrodataExtractor(strict=True).extract(markup)
    self.assertEqual(result, wanted)
def test_w3c_textContent_values(self):
    # Extract from the w3c 'microdata.4.2.strings.html' fixture with
    # strict=True; the output must equal the decoded
    # 'microdata.4.2.strings.json' fixture.
    source_html = get_testdata('w3c', 'microdata.4.2.strings.html')
    expected_raw = get_testdata('w3c', 'microdata.4.2.strings.json')
    expected_items = json.loads(expected_raw.decode('UTF-8'))
    strict_extractor = MicrodataExtractor(strict=True)
    actual_items = strict_extractor.extract(source_html)
    self.assertEqual(actual_items, expected_items)
def test_schemaorg_MusicRecording(self):
    # For every numbered schema.org MusicRecording fixture (currently
    # only number 1), extract with a default MicrodataExtractor and
    # compare against the JSON fixture carrying the same number.
    for number in [1]:
        html_name = 'MusicRecording.{:03d}.html'.format(number)
        page = get_testdata('schema.org', html_name)
        json_name = 'MusicRecording.{:03d}.json'.format(number)
        json_bytes = get_testdata('schema.org', json_name)
        wanted = json.loads(json_bytes.decode('UTF-8'))
        extractor = MicrodataExtractor()
        self.assertEqual(extractor.extract(page), wanted)
def test_w3c_object_element(self):
    # Extract from the w3c 'microdata.object.html' fixture with
    # strict=True, passing 'http://www.example.com/microdata/test' as the
    # second positional argument to extract(), and compare against the
    # decoded 'microdata.object.json' fixture.
    document = get_testdata('w3c', 'microdata.object.html')
    reference_bytes = get_testdata('w3c', 'microdata.object.json')
    reference = json.loads(reference_bytes.decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    items = extractor.extract(document, 'http://www.example.com/microdata/test')
    self.assertEqual(items, reference)
def test_w3c_7_1(self):
    # Extract from the w3c 'microdata.7.1.html' fixture with
    # nested=False and strict=True, passing
    # 'http://blog.example.com/progress-report' as the second positional
    # argument to extract(); the result must equal the decoded
    # 'microdata.7.1.flat.json' fixture.
    content = get_testdata('w3c', 'microdata.7.1.html')
    decoded = get_testdata('w3c', 'microdata.7.1.flat.json').decode('UTF-8')
    wanted = json.loads(decoded)
    flat_extractor = MicrodataExtractor(nested=False, strict=True)
    got = flat_extractor.extract(content, 'http://blog.example.com/progress-report')
    self.assertEqual(got, wanted)
def test_w3c_data_element(self):
    # Extract from the w3c 'microdata.4.2.data.html' fixture with
    # strict=True and compare against the decoded
    # 'microdata.4.2.data.json' fixture.
    html_bytes = get_testdata('w3c', 'microdata.4.2.data.html')
    json_bytes = get_testdata('w3c', 'microdata.4.2.data.json')
    expected_output = json.loads(json_bytes.decode('UTF-8'))
    actual_output = MicrodataExtractor(strict=True).extract(html_bytes)
    self.assertEqual(actual_output, expected_output)
def test_join_custom_url(self):
    # Extract from the schema.org 'product.html' fixture with a default
    # MicrodataExtractor, supplying base_url='http://some-example.com',
    # and compare against the decoded 'product_custom_url.json' fixture.
    product_page = get_testdata('schema.org', 'product.html')
    fixture = get_testdata('schema.org', 'product_custom_url.json')
    wanted = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor()
    result = extractor.extract(product_page, base_url='http://some-example.com')
    self.assertEqual(result, wanted)