Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_w3c_rdf11primer(self):
    """Compare RDFa extraction with the stored expanded JSON-LD fixture.

    For each listed example number, the ``w3c.rdf11primer.exampleNNN.html``
    fixture is run through ``RDFaExtractor`` and the result is compared with
    the matching ``.expanded.json`` fixture via ``assertJsonLDEqual``.
    """
    for example_number in (14,):
        stem = 'w3c.rdf11primer.example{:03d}'.format(example_number)
        html = get_testdata('w3crdfa', stem + '.html')
        # Fixtures are returned as bytes; decode before parsing the JSON.
        raw_expected = get_testdata('w3crdfa', stem + '.expanded.json')
        wanted = json.loads(raw_expected.decode('UTF-8'))

        extractor = RDFaExtractor()
        extracted = extractor.extract(
            html, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(extracted, wanted)
def test_expanded_opengraph_support(self):
    """Compare RDFa extraction of the expanded-OG fixture with its JSON.

    Loads ``expanded_OG_support_test.html`` from the ``misc`` fixtures, runs
    it through ``RDFaExtractor`` and checks the output against
    ``expanded_OG_support_test.json`` using ``assertJsonLDEqual``.
    """
    html = get_testdata('misc', 'expanded_OG_support_test.html')
    # Fixtures are returned as bytes; decode before parsing the JSON.
    raw_expected = get_testdata('misc', 'expanded_OG_support_test.json')
    wanted = json.loads(raw_expected.decode('UTF-8'))

    extractor = RDFaExtractor()
    extracted = extractor.extract(
        html, base_url='http://www.example.com/index.html')
    self.assertJsonLDEqual(extracted, wanted)
def test_wikipedia_xhtml_rdfa_no_prefix(self):
    """Compare RDFa extraction of the Portfolio fixture with its JSON.

    Runs ``Portfolio_Niels_Lubberman.html`` (``misc`` fixtures) through
    ``RDFaExtractor`` with the site's own base URL and checks the result
    against ``Portfolio_Niels_Lubberman.json``.

    NOTE(review): despite "wikipedia" in the test name, the fixtures used
    here are the Portfolio_Niels_Lubberman files -- confirm the name is
    intentional.
    """
    html = get_testdata('misc', 'Portfolio_Niels_Lubberman.html')
    # Fixtures are returned as bytes; decode before parsing the JSON.
    raw_expected = get_testdata('misc', 'Portfolio_Niels_Lubberman.json')
    wanted = json.loads(raw_expected.decode('UTF-8'))

    extractor = RDFaExtractor()
    extracted = extractor.extract(
        html, base_url='http://nielslubberman.nl/drupal/')
    self.assertJsonLDEqual(extracted, wanted)
def test_wikipedia_xhtml_rdfa(self):
    """Compare RDFa extraction of the ``xhtml+rdfa`` fixture with its JSON.

    Runs ``xhtml+rdfa.html`` from the ``wikipedia`` fixtures through
    ``RDFaExtractor`` and checks the output against
    ``xhtml+rdfa.expanded.json`` using ``assertJsonLDEqual``.
    """
    stem = 'xhtml+rdfa'
    html = get_testdata('wikipedia', stem + '.html')
    # Fixtures are returned as bytes; decode before parsing the JSON.
    raw_expected = get_testdata('wikipedia', stem + '.expanded.json')
    wanted = json.loads(raw_expected.decode('UTF-8'))

    extractor = RDFaExtractor()
    extracted = extractor.extract(
        html, base_url='http://www.example.com/index.html')
    self.assertJsonLDEqual(extracted, wanted)
def test_w3c_rdfalite(self):
    """Compare RDFa extraction with the stored expanded JSON-LD fixtures.

    For each listed example number, the ``w3c.rdfalite.exampleNNN.html``
    fixture is run through a fresh ``RDFaExtractor`` and the result is
    compared with the matching ``.expanded.json`` fixture.
    """
    for example_number in (3, 4, 5):
        stem = 'w3c.rdfalite.example{:03d}'.format(example_number)
        html = get_testdata('w3crdfa', stem + '.html')
        # Fixtures are returned as bytes; decode before parsing the JSON.
        raw_expected = get_testdata('w3crdfa', stem + '.expanded.json')
        wanted = json.loads(raw_expected.decode('UTF-8'))

        extractor = RDFaExtractor()
        extracted = extractor.extract(
            html, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(extracted, wanted)
def test_w3c_rdfaprimer(self):
    """Compare RDFa extraction with fixtures, with and without ``_fix_order``.

    For each listed example number, the ``w3c.rdfaprimer.exampleNNN.html``
    fixture is extracted and compared with its ``.expanded.json`` fixture.
    The extraction is then repeated with the extractor's ``_fix_order``
    replaced by a function that always raises, and the same expected
    output is asserted again.
    """
    # This is for testing that the fix to issue 116 does not affect
    # severely rdfa output even in a presence of a bug in the code
    def mocked_fix_order(x, y, z):
        raise Exception()

    page_url = 'http://www.example.com/index.html'
    for example_number in (5, 6, 7, 8, 9, 10, 11, 15):
        stem = 'w3c.rdfaprimer.example{:03d}'.format(example_number)
        print(stem)
        html = get_testdata('w3crdfa', stem + '.html')
        # Fixtures are returned as bytes; decode before parsing the JSON.
        raw_expected = get_testdata('w3crdfa', stem + '.expanded.json')
        wanted = json.loads(raw_expected.decode('UTF-8'))

        extractor = RDFaExtractor()
        extracted = extractor.extract(html, base_url=page_url)
        self.assertJsonLDEqual(extracted, wanted)

        # Second pass: same document, but with the ordering hook broken.
        extractor._fix_order = mocked_fix_order
        extracted = extractor.extract(html, base_url=page_url)
        self.assertJsonLDEqual(extracted, wanted)
def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
    """Run the requested structured-data extractors over ``response``.

    Args:
        response: Object exposing ``text`` (the document markup) and
            ``url``; both are read here.
        microdata: When true, run ``MicrodataExtractor`` and store its
            output in ``self.mdedata``.
        jsonld: When true, run ``JsonLdExtractor`` and store its output
            in ``self.jldata``.
        rdfa: When true, run ``RDFaExtractor`` and store its output in
            ``self.rdfadata``.

    A result attribute is only created for a syntax whose flag is true.
    If extraction fails with a decode error, the attribute is set to an
    empty list so that later reads do not raise ``AttributeError``.
    """
    self.response = response
    self.microdata = microdata
    self.jsonld = jsonld
    self.rdfa = rdfa

    if rdfa:
        try:
            self.rdfae = RDFaExtractor()
            self.rdfadata = self.rdfae.extract(self.response.text,
                                               url=self.response.url)
        except JSONDecodeError:
            # Previously swallowed with ``pass``, which left
            # ``self.rdfadata`` undefined; use the same empty fallback
            # as the json-ld branch below.
            self.rdfadata = []

    if microdata:
        try:
            self.mde = MicrodataExtractor()
            self.mdedata = self.mde.extract(self.response.text)
        except JSONDecodeError:
            # Same fallback as above, so ``self.mdedata`` always exists
            # once microdata extraction was requested.
            self.mdedata = []

    if jsonld:
        try:
            self.jlde = JsonLdExtractor()
            self.jldata = self.jlde.extract(self.response.text)
        except (JSONDecodeError, TypeError):
            self.jldata = []
if errors not in ['log', 'ignore', 'strict']:
raise ValueError('Invalid error command, valid values are either "log"'
', "ignore" or "strict"')
domparser = XmlDomHTMLParser(encoding=encoding)
tree = fromstring(htmlstring, parser=domparser)
processors = []
if 'microdata' in syntaxes:
processors.append(('microdata', MicrodataExtractor().extract_items))
if 'json-ld' in syntaxes:
processors.append(('json-ld', JsonLdExtractor().extract_items))
if 'opengraph' in syntaxes:
processors.append(('opengraph', OpenGraphExtractor().extract_items))
if 'microformat' in syntaxes:
processors.append(('microformat', MicroformatExtractor().extract_items))
if 'rdfa' in syntaxes:
processors.append(('rdfa', RDFaExtractor().extract_items))
output = {}
for label, extract in processors:
try:
output[label] = [obj for obj in extract(document=tree,
url=url,
html=htmlstring)]
except Exception:
if errors == 'log':
logger.exception("Failed to extract {} from {}".format(label, url))
if errors == 'ignore':
pass
if errors == 'strict':
raise
if uniform:
if 'microdata' in syntaxes:
))
if 'opengraph' in syntaxes:
processors.append(
('opengraph',
OpenGraphExtractor().extract_items,
tree
))
if 'microformat' in syntaxes:
processors.append(
('microformat',
MicroformatExtractor().extract_items,
htmlstring
))
if 'rdfa' in syntaxes:
processors.append(
('rdfa', RDFaExtractor().extract_items,
tree,
))
output = {}
for syntax, extract, document in processors:
try:
output[syntax] = list(extract(document, base_url=base_url))
except Exception as e:
if errors == 'log':
logger.exception('Failed to extract {}, raises {}'
.format(syntax, e)
)
if errors == 'ignore':
pass
if errors == 'strict':
raise