def test_microformat(self):
    body = get_testdata('misc', 'microformat_test.html')
    expected = json.loads(get_testdata('misc', 'microformat_flat_test.json').decode('UTF-8'))
    data = extruct.extract(body, uniform=True)
    self.assertEqual(jsonize_dict(data['microformat']), expected)
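For orientation, a minimal standalone call looks roughly like the sketch below; the inline HTML and the printed key are illustrative, not taken from the test data above.

import extruct

html = '<div class="h-card"><span class="p-name">Jane Doe</span></div>'
# extract() returns a dict keyed by syntax name ('microformat', 'json-ld', ...);
# uniform=True asks extruct to reshape each syntax's output into a common,
# schema.org-style structure, which is what the flattened fixture above encodes.
data = extruct.extract(html, uniform=True)
print(data['microformat'])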
"aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "4.4",
"reviewCount": "89"},
"offers": {
"@type": "Offer",
"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "2020-11-05",
"seller": {"@type": "Organization",
"name": "Executive Objects"},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"
}}]
body = get_testdata('misc', 'product_microdata.html')
data = extruct.extract(body, syntaxes=['microdata'], uniform=True)
self.assertEqual(data['microdata'], expected)
def test_extra_kwargs(self):
    body, expected = self._microdata_custom_url('product_custom_url.json')
    with self.assertRaises(TypeError):
        extruct.extract(body, foo='bar')
def test_errors(self):
    body = ''

    # raise exceptions
    with pytest.raises(Exception):
        data = extruct.extract(body)

    # ignore exceptions
    expected = {}
    data = extruct.extract(body, errors='ignore')
    assert data == expected

    # log exceptions
    data = extruct.extract(body, errors='log')
    assert data == expected
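The same errors modes can be tried outside the test harness; a minimal sketch, assuming only that extruct is installed:

import extruct

body = ''  # empty input makes every parser fail
try:
    extruct.extract(body)                 # errors='strict' is the default: re-raises
except Exception as exc:
    print('strict:', type(exc).__name__)

print(extruct.extract(body, errors='ignore'))  # {} - failing syntaxes are dropped silently
print(extruct.extract(body, errors='log'))     # {} - failures are logged, result is still {}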
# assumes the surrounding code defines initial, limit, output and urlpatterns,
# and imports sys, urllib.parse, requests, BeautifulSoup (bs4) and get_base_url
# (e.g. from w3lib.html)
queue = [initial]
visited = set()
while len(queue) > 0 and len(visited) < limit:
    next = queue.pop()
    if next in visited:
        continue
    visited.add(next)
    print(f'Calling {next}', file=sys.stderr)
    try:
        response = requests.get(next)
        base_url = get_base_url(response.text, response.url)
        data = extruct.extract(response.text, base_url=base_url, syntaxes=['json-ld'])
        if len(data['json-ld']) > 0:
            output.append(data['json-ld'][0])
        soup = BeautifulSoup(response.text, 'html5lib')
        for link in soup.find_all('a'):
            if 'href' not in link.attrs:
                continue
            linkurl = urllib.parse.urljoin(base_url, link['href'])
            if linkurl in visited:
                continue
            if urlpatterns:
                for pat in urlpatterns:
                    if pat(linkurl):
                        queue.insert(0, linkurl.split('?')[0])
def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False,
                      schema_context='http://schema.org', errors='strict'):
    resp = requests.get(url, timeout=30)
    result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        return result
    result.update(extruct.extract(resp.content,
                                  base_url=url,  # FIXME: use base url
                                  syntaxes=syntaxes,
                                  uniform=uniform,
                                  schema_context=schema_context,
                                  errors=errors))
    return result
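A hypothetical call to the helper above (the URL is an example, and SYNTAXES is whatever default list the surrounding module defines):

result = metadata_from_url('https://example.com/product', syntaxes=['json-ld', 'microdata'])
print(result['status'])                    # e.g. '200 OK', or an error status if the fetch failed
for item in result.get('json-ld', []):     # per-syntax lists are merged into the result dict
    print(item.get('@type'))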
def perform(kls, inputs):
    url = inputs['target:url']
    try:
        r = requests.get(url)
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])['json-ld']
        tree = Tree(data)
    except:
        data = None
    return dict(
        **{
            'metric:30': {
                'answer': 'yes' if data else 'no',
                'comment': 'jsonld was found and properly parsed' if data else 'jsonld could not be parsed',
            },
        },
        **{
            key: {
                'answer': 'yes' if attr else 'no',
                'comment': attr if attr else 'json-ld %s not found' % (' '.join(to_schema[key])),
            } if key.startswith('metric:') else attr
def _ProcessDspl2File(filename, fileobj, *, type=''):
    if any([filename.endswith('.html'),
            type.startswith('text/html')]):
        data = extruct.extract(fileobj.read(), uniform=True)
        return LoadGraph({
            '@context': 'http://schema.org',
            '@graph': [
                subdata_elem
                for subdata in data.values()
                for subdata_elem in subdata
                if subdata
            ]
        }, filename)
    if any([filename.endswith('.json'),
            filename.endswith('.jsonld'),
            type.startswith('application/ld+json')]):
        json_val = json.load(fileobj)
        return LoadGraph(json_val, filename)
    no results - an empty list will be returned
    """
    if not isinstance(url, str):
        raise TypeError('url is type "{}", a string was expected'
                        ''.format(type(url)))
    data = {}  # type: Dict[str, List[Dict]]
    if not user_agent_str:
        user_agent_str = USER_AGENT_STR
    r = requests.get(url, headers={'User-Agent': user_agent_str})
    r.raise_for_status()
    data = extruct.extract(r.text, r.url)
    url = r.url
    scrapings = _convert_to_scrapings(data, nonstandard_attrs, url=url)
    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)
    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)
    return scrapings