# Note the removal of the fragment ID. This is necessary, per the HTTP spec
url = name.split('#')[0]
if socket.getfqdn().endswith('.w3.org'):
    import checkremote
    checkremote.check_url_safety(url)
if 'Accept' not in additional_headers:
    additional_headers['Accept'] = 'text/html, application/xhtml+xml'

import requests
r = requests.get(url, headers=additional_headers)
self.data = r.content
self.headers = r.headers
if URIOpener.CONTENT_TYPE in self.headers:
    # The call below will remove the possible media type parameters, like charset settings
    ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
    self.content_type = ct.media_type
    if 'charset' in ct.parmdict:
        self.charset = ct.parmdict['charset']
    else:
        self.charset = None
else:
    # Check whether the suffix can be used for the content type; this may be
    # important for file:// type URIs, or if the server is not properly set up
    # to return the right MIME type
    self.charset = None
    self.content_type = ""
    for suffix in preferred_suffixes:
        if name.endswith(suffix):
            self.content_type = preferred_suffixes[suffix]
            break
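The suffix lookup above can also be approximated with the standard library's MIME registry; a minimal sketch under that assumption (guess_content_type is a hypothetical helper, not part of pyRdfa):

import mimetypes

def guess_content_type(name, default=""):
    # Hypothetical helper mirroring the suffix fallback above, backed by the
    # stdlib mimetypes table instead of a hand-maintained preferred_suffixes dict.
    guessed, _encoding = mimetypes.guess_type(name)
    return guessed or default

print(guess_content_type("index.html"))  # -> "text/html"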
            {'@context': 'http://example.com',
             '@type': 'example_type',
             /* all the other properties as keys here */
            }
    schema_context: schema's context for current page"""
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of "
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(('microdata', MicrodataExtractor().extract_items))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items))
    if 'opengraph' in syntaxes:
        processors.append(('opengraph', OpenGraphExtractor().extract_items))
    if 'microformat' in syntaxes:
        processors.append(('microformat', MicroformatExtractor().extract_items))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items))
    output = {}
    for label, extract in processors:
        try:
            output[label] = [obj for obj in extract(document=tree,
                                                    url=url,
                                                    html=htmlstring)]
        except Exception:
            if errors == 'log':
                logger.exception("Failed to extract {} from {}".format(label, url))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
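For context, a typical call into this code through extruct's public API might look like the sketch below; the HTML string is made up, while syntaxes and errors are exactly the parameters validated above:

import extruct

html = '<script type="application/ld+json">{"@type": "Product"}</script>'

# Restrict extraction to two syntaxes; failures are logged rather than raised.
data = extruct.extract(html, syntaxes=['json-ld', 'microdata'], errors='log')
print(data['json-ld'])

# A syntax name outside SYNTAXES trips the ValueError raised above:
# extruct.extract(html, syntaxes=['bogus'])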
    always be processed, but be at a lower priority than a complete
    matching type.

    See also: RFC 2616 section 14.1, and
    """
    if _is_string(accept_header):
        accept_list = parse_accept_header(accept_header)
    else:
        accept_list = accept_header

    if _is_string(content_types):
        content_types = [content_types]
    server_ctlist = [content_type(ct) for ct in content_types]
    best = None  # (content_type, qvalue, accept_parms, matchlen)
    for server_ct in server_ctlist:
        best_for_this = None
        for client_ct, qvalue, aargs in accept_list:
            if ignore_wildcard and client_ct.is_universal_wildcard():
                continue  # */* being ignored
            matchlen = 0  # how specifically this one matches (0 is a non-match)
            if client_ct.is_universal_wildcard():
                matchlen = 1  # */* is a 1
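To make the matchlen ranking concrete: an exact media type should outrank a subtype wildcard, which in turn outranks */*. A self-contained sketch of that ordering (not the module's actual implementation):

def match_specificity(client_ct, server_ct):
    # 0 = no match; higher values mean a more specific (preferred) match.
    if client_ct == '*/*':
        return 1
    ctype, csub = client_ct.split('/')
    stype, ssub = server_ct.split('/')
    if ctype == stype and csub == '*':
        return 2
    if ctype == stype and csub == ssub:
        return 3
    return 0

assert match_specificity('*/*', 'text/html') == 1
assert match_specificity('text/*', 'text/html') == 2
assert match_specificity('text/html', 'text/html') == 3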
if URIOpener.CONTENT_LOCATION in self.headers:
    self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
else:
    self.location = name

self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
if URIOpener.EXPIRES in self.headers:
    try:
        # Thanks to Deron Meranda for the HTTP date conversion method...
        self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
    except Exception:
        # The Expires date format was wrong, sorry, forget it...
        pass
self.last_modified_date = None
if URIOpener.LAST_MODIFIED in self.headers:
    try:
        self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
    except Exception:
        # The date format was wrong; leave the last modified date unset
        pass
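parse_http_datetime here is the module's own helper; outside it, the standard library can parse the same RFC 7231 date format, for example:

from email.utils import parsedate_to_datetime

expires = parsedate_to_datetime('Tue, 15 Nov 1994 08:12:31 GMT')
print(expires)  # 1994-11-15 08:12:31+00:00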
def test_microformat(self):
    body = get_testdata('misc', 'microformat_test.html')
    expected = json.loads(get_testdata('misc', 'microformat_flat_test.json').decode('UTF-8'))
    data = extruct.extract(body, uniform=True)
    self.assertEqual(jsonize_dict(data['microformat']), expected)
"aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "4.4",
"reviewCount": "89"},
"offers": {
"@type": "Offer",
"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "2020-11-05",
"seller": {"@type": "Organization",
"name": "Executive Objects"},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"
}}]
body = get_testdata('misc', 'product_microdata.html')
data = extruct.extract(body, syntaxes=['microdata'], uniform=True)
self.assertEqual(data['microdata'], expected)
def test_extra_kwargs(self):
    body, expected = self._microdata_custom_url('product_custom_url.json')
    with self.assertRaises(TypeError):
        extruct.extract(body, foo='bar')
def test_errors(self):
    body = ''

    # raise exceptions
    with pytest.raises(Exception):
        data = extruct.extract(body)

    # ignore exceptions
    expected = {}
    data = extruct.extract(body, errors='ignore')
    assert data == expected

    # log exceptions
    data = extruct.extract(body, errors='log')
    assert data == expected
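The errors='log' branch exercised above only becomes visible when logging is configured, since extruct routes failures through the standard logging module (as the logger.exception call earlier suggests); a minimal sketch:

import logging
import extruct

logging.basicConfig(level=logging.ERROR)   # surface logger.exception() output
data = extruct.extract('', errors='log')   # empty body: the failure is logged, not raised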
def test_join_none(self):
    body = get_testdata('schema.org', 'product-ref.html')
    expected = json.loads(get_testdata('schema.org', 'product-ref.json').decode('UTF-8'))
    mde = MicrodataExtractor()
    data = mde.extract(body)
    self.assertEqual(data, expected)