How to use the tldextract.extract function in tldextract

To help you get started, we’ve selected a few tldextract examples, based on popular ways it is used in public projects.
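
Before the project snippets, here is a minimal sketch of the call they all build on. The URL is made up for illustration; subdomain, domain and suffix are the fields tldextract.extract returns.

import tldextract

# Split a URL into subdomain, registered domain name, and public suffix.
parts = tldextract.extract('https://forums.news.cnn.com/path?q=1')
print(parts.subdomain)  # 'forums.news'
print(parts.domain)     # 'cnn'
print(parts.suffix)     # 'com'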

GitHub: gumob / music-dl / music_dl / MusicDL.py

        logger.setLevel(logging.INFO)

        """ Print version """

        logger.info(pkg_resources.require("music_dl")[0])

        """ Validate parameters """

        logger.info('Validating parameters...')

        try:
            # Validate download url
            url_parsed = urlparse(self.download_url)
            if not url_parsed.scheme.startswith('http'):
                raise DirectoryException('Invalid URL. URL must start with http*. Input value is {}'.format(self.download_url))
            tld_parsed = tldextract.extract(self.download_url)
            if not (tld_parsed.domain in ['youtube', 'soundcloud']):
                raise DirectoryException('Invalid URL. Music Downloader supports only YouTube and SoundCloud. Input value is {}'.format(self.download_url))
            # Validate download directory
            if not is_path_exists_or_creatable(self.working_dir):
                raise DirectoryException('Invalid directory. Please specify valid download directory. Input value is {}'.format(self.working_dir))

        except DirectoryException as e:
            logger.error(e.message)
            logger.fatal('Aborted.')
            exit()

        # Validate playlist configuration
        try:
            self.playlist.validate()

        except PlaylistParameterException as e:
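
The excerpt above checks the scheme with urlparse and the site with tldextract before downloading. A stripped-down sketch of the same idea; ALLOWED_SITES and the plain ValueError are stand-ins for the project's own configuration and DirectoryException.

from urllib.parse import urlparse
import tldextract

ALLOWED_SITES = {'youtube', 'soundcloud'}  # hypothetical whitelist

def validate_download_url(url):
    # Reject anything that is not an http(s) URL.
    if not urlparse(url).scheme.startswith('http'):
        raise ValueError('URL must start with http*: {}'.format(url))
    # Compare only the registered name, so www.youtube.com and
    # m.youtube.com both count as 'youtube'.
    if tldextract.extract(url).domain not in ALLOWED_SITES:
        raise ValueError('Only YouTube and SoundCloud are supported: {}'.format(url))

validate_download_url('https://www.youtube.com/watch?v=dQw4w9WgXcQ')  # passes silently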

GitHub: codelucas / newspaper / newspaper / urls.py

        # if the file type is a media type, reject instantly
        if file_type and file_type not in ALLOWED_TYPES:
            if verbose: print('\t%s rejected due to bad filetype' % url)
            return False

        last_chunk = path_chunks[-1].split('.')
        # the file type is not of use to us anymore, remove from url
        if len(last_chunk) > 1:
            path_chunks[-1] = last_chunk[-2]

    # Index gives us no information
    if 'index' in path_chunks:
        path_chunks.remove('index')

    # extract the tld (top level domain)
    tld_dat = tldextract.extract(url)
    subd = tld_dat.subdomain
    tld = tld_dat.domain.lower()

    url_slug = path_chunks[-1] if path_chunks else ''

    if tld in BAD_DOMAINS:
        if verbose: print('%s caught for a bad tld' % url)
        return False

    if len(path_chunks) == 0:
        dash_count, underscore_count = 0, 0
    else:
        dash_count = url_slug.count('-')
        underscore_count = url_slug.count('_')

    # If the url has a news slug title
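
Despite the variable name, tld_dat.domain above is the registered name ('cnn' for www.cnn.com), not the literal top-level domain; that is what gets compared against BAD_DOMAINS. A self-contained sketch of that filter, with a made-up blocklist:

import tldextract

BAD_DOMAINS = {'amazon', 'doubleclick', 'twitter'}  # made-up blocklist

def has_acceptable_domain(url, verbose=False):
    # .domain is the registered name ('cnn' for www.cnn.com).
    domain = tldextract.extract(url).domain.lower()
    if domain in BAD_DOMAINS:
        if verbose:
            print('%s caught for a bad domain' % url)
        return False
    return True

print(has_acceptable_domain('http://www.cnn.com/2024/story.html'))  # True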

GitHub: DeFacto / DeFacto / python / trustworthiness / util.py

        except:
            continue

    # appending upper level domains, from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
    # Version 2018040300, Last Updated Tue Apr  3 07:07:01 2018 UTC
    df = pd.read_csv(config.datasets + 'data/iana/org/TLD/tlds-alpha-by-domain.txt', sep=" ", header=None)
    for index, row in df.iterrows():
        print(index, row[0])
        domain.append(str(row[0]).lower())

    df = pd.read_csv(DATASET_MICROSOFT_PATH, delimiter='\t', header=0)
    for index, row in df.iterrows():
        url = str(row[3])
        print(index, url)
        try:
            o = tldextract.extract(url)
            if o.suffix is not None:
                domain_s.append(str(o.suffix).lower())
            if o.domain is not None:
                domain.append(str(o.domain).lower())
        except:
            continue


    le1.fit(domain)
    joblib.dump(le1, ENC_WEB_DOMAIN)
    print(le1.classes_)

    le2.fit(domain_s)
    joblib.dump(le2, ENC_WEB_DOMAIN_SUFFIX)
    print(le2.classes_)
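
The loop above collects domain and suffix strings from several CSV files and fits scikit-learn LabelEncoders on them. A condensed sketch of the core idea, using an in-memory list of URLs instead of the project's datasets:

import tldextract
from sklearn import preprocessing

urls = ['https://www.bbc.co.uk/news', 'http://example.com/page']  # stand-in data

domains, suffixes = [''], ['']   # keep '' as a class for unparseable values
for url in urls:
    parts = tldextract.extract(url)
    # .domain and .suffix are empty strings (never None) when missing.
    domains.append(parts.domain.lower())
    suffixes.append(parts.suffix.lower())

domain_encoder = preprocessing.LabelEncoder().fit(domains)
suffix_encoder = preprocessing.LabelEncoder().fit(suffixes)
print(domain_encoder.classes_)   # ['' 'bbc' 'example']
print(suffix_encoder.classes_)   # ['' 'co.uk' 'com']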

GitHub: codelucas / newspaper / newspaper / extractors.py

                if self.config.verbose:
                    print('elim category url %s for no domain and path'
                          % p_url)
                continue
            if path and path.startswith('#'):
                if self.config.verbose:
                    print('elim category url %s path starts with #' % p_url)
                continue
            if scheme and (scheme != 'http' and scheme != 'https'):
                if self.config.verbose:
                    print(('elim category url %s for bad scheme, '
                           'not http nor https' % p_url))
                continue

            if domain:
                child_tld = tldextract.extract(p_url)
                domain_tld = tldextract.extract(source_url)
                child_subdomain_parts = child_tld.subdomain.split('.')
                subdomain_contains = False
                for part in child_subdomain_parts:
                    if part == domain_tld.domain:
                        if self.config.verbose:
                            print(('subdomain contains at %s and %s' %
                                   (str(part), str(domain_tld.domain))))
                        subdomain_contains = True
                        break

                # Ex. microsoft.com is definitely not related to
                # espn.com, but espn.go.com is probably related to espn.com
                if not subdomain_contains and \
                        (child_tld.domain != domain_tld.domain):
                    if self.config.verbose:
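
The check above treats a category URL as related to the source when the two share a registered name, or when the source's name appears as a label in the child's subdomain (espn.go.com versus espn.com). A self-contained sketch; is_related_url is a hypothetical helper, not newspaper's API:

import tldextract

def is_related_url(candidate_url, source_url):
    child = tldextract.extract(candidate_url)
    source = tldextract.extract(source_url)
    # Same registered name, e.g. news.espn.com vs espn.com.
    if child.domain == source.domain:
        return True
    # Source name buried in the subdomain, e.g. espn.go.com vs espn.com.
    return source.domain in child.subdomain.split('.')

print(is_related_url('http://espn.go.com', 'http://espn.com'))      # True
print(is_related_url('http://microsoft.com', 'http://espn.com'))    # False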

GitHub: DeFacto / DeFacto / python / trustworthiness / features_core.py

    def get_open_page_rank(self, url):
        try:
            o = tldextract.extract(url)
            domain=('%s.%s' % (o.domain, o.suffix))
            try:
                pginfo=self.page_rank.pg[domain]
            except KeyError:
                config.logger.warn('page rank information for domain [' + domain + '] not found')
                return MISSING_FEATURE * 2, True
            return [pginfo['page_rank_decimal'], pginfo['rank']], False
        except Exception as e:
            config.logger.error(repr(e))
            return MISSING_FEATURE * 2, True
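
The method above builds a 'domain.suffix' key ('bbc.co.uk') to look up a precomputed page-rank table. Recent tldextract versions expose a registered_domain property that yields the same string (and '' when there is no recognised suffix); a sketch with a made-up rank table in place of self.page_rank:

import tldextract

page_ranks = {'bbc.co.uk': {'page_rank_decimal': 7.1, 'rank': 87}}  # made-up data

def get_open_page_rank(url):
    parts = tldextract.extract(url)
    # registered_domain joins domain and suffix: 'bbc.co.uk'.
    info = page_ranks.get(parts.registered_domain)
    if info is None:
        return None, True                      # missing-feature marker, error flag
    return [info['page_rank_decimal'], info['rank']], False

print(get_open_page_rank('https://www.bbc.co.uk/news'))   # ([7.1, 87], False)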

GitHub: DeFacto / DeFacto / python / trustworthiness / util.py

    import pandas as pd
    from sklearn import preprocessing
    le1 = preprocessing.LabelEncoder()
    le2 = preprocessing.LabelEncoder()

    domain_s = ['com']
    domain_s = ['']
    domain = ['']

    df_sites = pd.read_csv(DATASET_3C_SITES_PATH, na_values=0, delimiter=',', usecols=['document_url'])
    for index, row in df_sites.iterrows():
        url = str(row[0])
        print(index, url)
        try:
            o = tldextract.extract(url)
            if o.suffix is not None:
                domain_s.append(str(o.suffix).lower())
            if o.domain is not None:
                domain.append(str(o.domain).lower())
        except:
            continue

    # appending upper level domains, from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
    # Version 2018040300, Last Updated Tue Apr  3 07:07:01 2018 UTC
    df = pd.read_csv(config.datasets + 'data/iana/org/TLD/tlds-alpha-by-domain.txt', sep=" ", header=None)
    for index, row in df.iterrows():
        print(index, row[0])
        domain.append(str(row[0]).lower())

    df = pd.read_csv(DATASET_MICROSOFT_PATH, delimiter='\t', header=0)
    for index, row in df.iterrows():
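
One detail worth knowing when writing loops like the ones above: tldextract returns empty strings, never None, for missing pieces, so the 'o.suffix is not None' tests always pass and the try/except plus later filtering do the real work. A truthiness check is the usual way to skip hosts without a public suffix:

import tldextract

parts = tldextract.extract('http://localhost:8080/admin')
print(repr(parts.suffix))   # ''  (empty string, not None)
print(repr(parts.domain))   # 'localhost'

if parts.suffix:            # skips hosts with no recognised public suffix
    print(parts.suffix.lower())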

GitHub: robbielynch / RoblySearch / robly_crawler / crawler.py

def get_base_url(url):
    """
    Takes a URL and returns the subdomain, domain and suffix concatenated
    to form the base url of the website. Uses the tldextract library.
    """
    tld = tldextract.extract(url)
    print(tld.subdomain, ' - ', tld.domain, ' - ', tld.suffix)
    if tld.subdomain != "":
        base_url = '.'.join([tld.subdomain, tld.domain, tld.suffix])
    else:
        base_url = '.'.join([tld.domain, tld.suffix])
    return base_url
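
tldextract also exposes an fqdn property that joins whichever of subdomain, domain and suffix are present, which collapses the if/else above into one line; a sketch assuming that property is available in the installed version:

import tldextract

def get_base_url(url):
    # fqdn skips the empty subdomain, so there are no stray leading dots.
    return tldextract.extract(url).fqdn

print(get_base_url('https://blog.example.com/search'))  # blog.example.com
print(get_base_url('https://example.com/search'))       # example.com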

GitHub: codelucas / newspaper / newspaper / source.py

        self.url = url
        self.url = urls.prepare_url(url)

        self.domain = urls.get_domain(self.url)
        self.scheme = urls.get_scheme(self.url)

        self.categories = []
        self.feeds = []
        self.articles = []

        self.html = ''
        self.doc = None

        self.logo_url = ''
        self.favicon = ''
        self.brand = tldextract.extract(self.url).domain
        self.description = ''

        self.is_parsed = False
        self.is_downloaded = False
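
The constructor above derives the source's 'brand' straight from the registered name of its URL. A small sketch of the same idea; describe_source and the use of urlparse are stand-ins for newspaper's own urls helpers:

from urllib.parse import urlparse
import tldextract

def describe_source(url):
    # Scheme from the standard library, brand from the registered name.
    return {
        'scheme': urlparse(url).scheme,
        'brand': tldextract.extract(url).domain,
    }

print(describe_source('https://www.theguardian.com/world'))
# {'scheme': 'https', 'brand': 'theguardian'}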

tldextract

Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
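
Opting into the Public Suffix List's private section is done by building a TLDExtract instance with include_psl_private_domains=True; the hostname below is only illustrative:

import tldextract

# Default extractor: ICANN suffixes only, so blogspot.com splits as
# subdomain='myblog', domain='blogspot', suffix='com'.
print(tldextract.extract('myblog.blogspot.com'))

# With private domains, blogspot.com itself is the suffix.
private_aware = tldextract.TLDExtract(include_psl_private_domains=True)
print(private_aware('myblog.blogspot.com'))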

License: BSD-3-Clause