How to use extruct - 9 common examples

To help you get started, we've selected a few extruct examples based on popular ways it is used in public projects.
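Before the examples, here is a minimal sketch of the typical entry point, extruct.extract(). The URL and page are hypothetical; the syntaxes and errors arguments are the same ones the snippets below exercise.

    import extruct
    import requests

    # Hypothetical page; any URL serving HTML with embedded metadata will do.
    r = requests.get('https://example.com/product')

    # Restrict extraction to two syntaxes and log (rather than raise) failures.
    data = extruct.extract(r.text, syntaxes=['microdata', 'json-ld'], errors='log')

    for item in data['json-ld']:
        print(item.get('@type'))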


Example 1: extruct/rdflibxml/utils.py (scrapinghub/extruct)
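This snippet comes from URIOpener, the helper extruct's RDFa machinery uses to fetch a remote document. It strips the fragment identifier from the URL (as the HTTP spec requires), sets a default Accept header for HTML/XHTML, and then determines the response's content type and charset, falling back to a file-suffix lookup when the server does not send a usable Content-Type.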
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec.
            url = name.split('#')[0]
            if socket.getfqdn().endswith('.w3.org'):
                import checkremote
                checkremote.check_url_safety(url)
            if 'Accept' not in additional_headers:
                additional_headers['Accept'] = 'text/html, application/xhtml+xml'

            import requests
            r = requests.get(url, headers=additional_headers)
            self.data = r.content
            self.headers = r.headers

            if URIOpener.CONTENT_TYPE in self.headers:
                # The call below removes the possible media type parameters, like charset settings
                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
                self.content_type = ct.media_type
                if 'charset' in ct.parmdict:
                    self.charset = ct.parmdict['charset']
                else:
                    self.charset = None
            else:
                # Check if the suffix can be used for the content type; this may be important
                # for file:// type URIs, or if the server is not properly set up to return the
                # right MIME type
                self.charset = None
                self.content_type = ""
                for suffix in preferred_suffixes.keys():
                    if name.endswith(suffix):
                        self.content_type = preferred_suffixes[suffix]
                        break
Example 2: extruct/__init__.py (scrapinghub/extruct)
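This is the core of extruct.extract(): after validating the syntaxes and errors arguments, it parses the HTML once into a DOM tree and runs one extractor per requested syntax, collecting the results into a dict keyed by syntax name. The snippet starts inside the docstring, at the fragment describing the uniform output shape.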
                 {'@context': 'http://example.com',
                  '@type': 'example_type',
                  /* all other properties in keys here */
                 }
       schema_context: schema's context for the current page"""
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of "
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(('microdata', MicrodataExtractor().extract_items))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items))
    if 'opengraph' in syntaxes:
        processors.append(('opengraph', OpenGraphExtractor().extract_items))
    if 'microformat' in syntaxes:
        processors.append(('microformat', MicroformatExtractor().extract_items))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items))
    output = {}
    for label, extract in processors:
        try:
            output[label] = [obj for obj in extract(document=tree,
                                                    url=url,
                                                    html=htmlstring)]
        except Exception:
            if errors == 'log':
                logger.exception("Failed to extract {} from {}".format(label, url))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    return output
Example 3: extruct/rdflibxml/extras/httpheader.py (scrapinghub/extruct)
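This fragment is from the vendored HTTP-header utilities. It matches the content types a server can produce against a parsed Accept header, optionally ignoring the */* wildcard, and scores wildcard matches lower than exact ones (see RFC 2616, section 14.1).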
    always be processed, but be at a lower priority than a complete
    matching type.

    See also: RFC 2616, section 14.1.
    """
    if _is_string(accept_header):
        accept_list = parse_accept_header(accept_header)
    else:
        accept_list = accept_header

    if _is_string(content_types):
        content_types = [content_types]

    server_ctlist = [content_type(ct) for ct in content_types]

    best = None   # (content_type, qvalue, accept_parms, matchlen)

    for server_ct in server_ctlist:
        best_for_this = None
        for client_ct, qvalue, aargs in accept_list:
            if ignore_wildcard and client_ct.is_universal_wildcard():
                continue  # */* being ignored

            matchlen = 0 # how specifically this one matches (0 is a non-match)
            if client_ct.is_universal_wildcard():
                matchlen = 1   # */* is a 1
Example 4: extruct/rdflibxml/utils.py, continued (scrapinghub/extruct)
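A continuation of the URIOpener snippet from the first example: it resolves a Content-Location header against the request URL and parses the Expires and Last-Modified headers, defaulting the expiration date to one day ahead.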
            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
            if URIOpener.EXPIRES in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
                except:
                    # The Expires date format was wrong; sorry, forget it...
                    pass

            self.last_modified_date = None
            if URIOpener.LAST_MODIFIED in self.headers:
                try:
                    # Same conversion as for Expires; ignore a malformed value
                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
                except:
                    pass
Example 5: tests/test_extruct_uniform.py (scrapinghub/extruct)
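A test of uniform output for microformats: extract with uniform=True and compare the flattened result against a stored JSON fixture.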
def test_microformat(self):
    body = get_testdata('misc', 'microformat_test.html')
    expected = json.loads(get_testdata('misc', 'microformat_flat_test.json').decode('UTF-8'))
    data = extruct.extract(body, uniform=True)
    self.assertEqual(jsonize_dict(data['microformat']), expected)
Example 6: tests/test_uniform.py (scrapinghub/extruct)
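The expected data here is the tail of a product fixture in the uniform, JSON-LD-like shape produced by uniform=True: an AggregateRating plus an Offer with seller, condition, and availability.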
"aggregateRating": {
                          "@type": "AggregateRating",
                          "ratingValue": "4.4",
                          "reviewCount": "89"},
                      "offers": {
                          "@type": "Offer",
                          "priceCurrency": "USD",
                          "price": "119.99",
                          "priceValidUntil": "2020-11-05",
                          "seller": {"@type": "Organization",
                                     "name": "Executive Objects"},
                          "itemCondition": "http://schema.org/UsedCondition",
                          "availability": "http://schema.org/InStock"
                          }}]
        body = get_testdata('misc', 'product_microdata.html')
        data = extruct.extract(body, syntaxes=['microdata'], uniform=True)
        self.assertEqual(data['microdata'], expected)
Example 7: tests/test_extruct.py (scrapinghub/extruct)
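extract() rejects unknown keyword arguments with a TypeError: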
def test_extra_kwargs(self):
    body, expected = self._microdata_custom_url('product_custom_url.json')
    with self.assertRaises(TypeError):
        extruct.extract(body, foo='bar')
Example 8: tests/test_extruct.py (scrapinghub/extruct)
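The errors argument in action: with the default ('strict') an empty document raises, while 'ignore' and 'log' both return an empty result.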
def test_errors(self):
    body = ''

    # raise exceptions (errors defaults to 'strict')
    with pytest.raises(Exception):
        data = extruct.extract(body)

    # ignore exceptions
    expected = {}
    data = extruct.extract(body, errors='ignore')
    assert data == expected

    # log exceptions
    data = extruct.extract(body, errors='log')
    assert data == expected
Example 9: tests/test_microdata.py (scrapinghub/extruct)
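MicrodataExtractor can also be used on its own; this test runs it over a schema.org product-ref fixture and checks the result against the expected JSON.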
def test_join_none(self):
    body = get_testdata('schema.org', 'product-ref.html')
    expected = json.loads(get_testdata('schema.org', 'product-ref.json').decode('UTF-8'))

    mde = MicrodataExtractor()
    data = mde.extract(body)
    self.assertEqual(data, expected)