How to use the tldextract.tldextract._PublicSuffixListTLDExtractor function in tldextract

To help you get started, we’ve selected a few tldextract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: john-kurkowski/tldextract — tldextract/tldextract.py (view on GitHub, external link)
self._extractor = _PublicSuffixListTLDExtractor(tlds)
            return self._extractor
        elif self.suffix_list_urls:
            raw_suffix_list_data = find_first_response(
                self.suffix_list_urls,
                self.cache_fetch_timeout
            )
            tlds = get_tlds_from_raw_suffix_list_data(
                raw_suffix_list_data,
                self.include_psl_private_domains
            )

        if not tlds and self.fallback_to_snapshot:
            tlds = self._get_snapshot_tld_extractor()
            tlds.extend(self.extra_suffixes)
            self._extractor = _PublicSuffixListTLDExtractor(tlds)
            return self._extractor
        elif not tlds:
            raise Exception("tlds is empty, but fallback_to_snapshot is set"
                            " to false. Cannot proceed without tlds.")

        self._cache_tlds(tlds)

        tlds.extend(self.extra_suffixes)
        self._extractor = _PublicSuffixListTLDExtractor(tlds)
        return self._extractor
Source: robbielynch/RoblySearch — tldextract/tldextract.py (view on GitHub, external link)
snapshot = sorted(pickle.load(snapshot_file))
            new = sorted(tlds)
            for line in difflib.unified_diff(snapshot, new, fromfile=".tld_set_snapshot", tofile=self.cache_file):
                if sys.version_info < (3,):
                    sys.stderr.write(line.encode('utf-8') + "\n")
                else:
                    sys.stderr.write(line + "\n")

        if self.cache_file:
            try:
                with open(self.cache_file, 'wb') as f:
                    pickle.dump(tlds, f)
            except IOError as e:
                LOG.warn("unable to cache TLDs in file %s: %s", self.cache_file, e)

        self._extractor = _PublicSuffixListTLDExtractor(tlds)
        return self._extractor
Source: john-kurkowski/tldextract — tldextract/tldextract.py (view on GitHub, external link)
self.include_psl_private_domains
            )

        if not tlds and self.fallback_to_snapshot:
            tlds = self._get_snapshot_tld_extractor()
            tlds.extend(self.extra_suffixes)
            self._extractor = _PublicSuffixListTLDExtractor(tlds)
            return self._extractor
        elif not tlds:
            raise Exception("tlds is empty, but fallback_to_snapshot is set"
                            " to false. Cannot proceed without tlds.")

        self._cache_tlds(tlds)

        tlds.extend(self.extra_suffixes)
        self._extractor = _PublicSuffixListTLDExtractor(tlds)
        return self._extractor
Source: john-kurkowski/tldextract — tldextract/tldextract.py (view on GitHub, external link)
def _get_tld_extractor(self):
        '''Get or compute this object's TLDExtractor. Looks up the TLDExtractor
        in roughly the following order, based on the settings passed to
        __init__:

        1. Memoized on `self`
        2. Local system cache file
        3. Remote PSL, over HTTP
        4. Bundled PSL snapshot file

        Every path that returns here first appends `self.extra_suffixes` to
        the TLD list, wraps the list in a `_PublicSuffixListTLDExtractor`,
        and stores the result on `self._extractor` so later calls hit step 1.
        '''
        # 1. Reuse the extractor built by an earlier call, if any.
        if self._extractor:
            return self._extractor

        # 2. Try previously cached TLDs (presumably the local cache file from
        #    the list above; `_get_cached_tlds` is defined outside this excerpt).
        tlds = self._get_cached_tlds()
        if tlds:
            tlds.extend(self.extra_suffixes)
            self._extractor = _PublicSuffixListTLDExtractor(tlds)
            return self._extractor
        elif self.suffix_list_urls:
            # 3. No cached TLDs: fetch raw suffix-list data from the
            #    configured URLs, passing `cache_fetch_timeout` to the fetch.
            raw_suffix_list_data = find_first_response(
                self.suffix_list_urls,
                self.cache_fetch_timeout
            )
            # Parse the raw data into TLDs; `include_psl_private_domains` is
            # forwarded to the parser.
            tlds = get_tlds_from_raw_suffix_list_data(
                raw_suffix_list_data,
                self.include_psl_private_domains
            )

        # 4. Still nothing usable: fall back to the snapshot if allowed.
        if not tlds and self.fallback_to_snapshot:
            # Despite its name, the helper's result is used as a TLD list here
            # (`.extend` is called on it), not as an extractor.
            tlds = self._get_snapshot_tld_extractor()
            tlds.extend(self.extra_suffixes)
            self._extractor = _PublicSuffixListTLDExtractor(tlds)
            return self._extractor
        # NOTE(review): as shown, two paths fall off the end and implicitly
        # return None without setting `self._extractor`: (a) TLDs were fetched
        # successfully in step 3, and (b) `tlds` is empty while
        # `fallback_to_snapshot` is falsy. This excerpt looks truncated --
        # confirm against the full source before relying on it.
Source: robbielynch/RoblySearch — tldextract/tldextract.py (view on GitHub, external link)
except IOError as ioe:
                file_not_found = ioe.errno == errno.ENOENT
                if not file_not_found:
                  LOG.error("error reading TLD cache file %s: %s", self.cache_file, ioe)
            except Exception as ex:
                LOG.error("error reading TLD cache file %s: %s", self.cache_file, ex)

        tlds = frozenset()
        if self.suffix_list_urls:
            raw_suffix_list_data = fetch_file(self.suffix_list_urls)
            tlds = get_tlds_from_raw_suffix_list_data(raw_suffix_list_data)

        if not tlds:
            if self.fallback_to_snapshot:
                with closing(pkg_resources.resource_stream(__name__, '.tld_set_snapshot')) as snapshot_file:
                    self._extractor = _PublicSuffixListTLDExtractor(pickle.load(snapshot_file))
                    return self._extractor
            else:
                raise Exception("tlds is empty, but fallback_to_snapshot is set"
                                " to false. Cannot proceed without tlds.")

        LOG.info("computed TLDs: [%s, ...]", ', '.join(list(tlds)[:10]))
        if LOG.isEnabledFor(logging.DEBUG):
            import difflib
            with closing(pkg_resources.resource_stream(__name__, '.tld_set_snapshot')) as snapshot_file:
                snapshot = sorted(pickle.load(snapshot_file))
            new = sorted(tlds)
            for line in difflib.unified_diff(snapshot, new, fromfile=".tld_set_snapshot", tofile=self.cache_file):
                if sys.version_info < (3,):
                    sys.stderr.write(line.encode('utf-8') + "\n")
                else:
                    sys.stderr.write(line + "\n")
Source: robbielynch/RoblySearch — tldextract/tldextract.py (view on GitHub, external link)
def _get_tld_extractor(self):

        if self._extractor:
            return self._extractor

        if self.cache_file:
            try:
                with open(self.cache_file) as f:
                    self._extractor = _PublicSuffixListTLDExtractor(pickle.load(f))
                    return self._extractor
            except IOError as ioe:
                file_not_found = ioe.errno == errno.ENOENT
                if not file_not_found:
                  LOG.error("error reading TLD cache file %s: %s", self.cache_file, ioe)
            except Exception as ex:
                LOG.error("error reading TLD cache file %s: %s", self.cache_file, ex)

        tlds = frozenset()
        if self.suffix_list_urls:
            raw_suffix_list_data = fetch_file(self.suffix_list_urls)
            tlds = get_tlds_from_raw_suffix_list_data(raw_suffix_list_data)

        if not tlds:
            if self.fallback_to_snapshot:
                with closing(pkg_resources.resource_stream(__name__, '.tld_set_snapshot')) as snapshot_file:

tldextract

Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.

BSD-3-Clause
Latest version published 4 days ago

Package Health Score

88 / 100
Full package analysis

Similar packages