How to use the tldextract.extract function in tldextract

To help you get started, we’ve selected a few tldextract examples, based on popular ways it is used in public projects.
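
Before the project snippets, here is a minimal sketch of the call they all build on. The URL is made up for illustration; subdomain, domain and suffix are the fields tldextract.extract returns.

import tldextract

# Split a URL into subdomain, registered domain name, and public suffix.
parts = tldextract.extract('https://forums.news.cnn.com/path?q=1')
print(parts.subdomain)  # 'forums.news'
print(parts.domain)     # 'cnn'
print(parts.suffix)     # 'com'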

GitHub: gumob / music-dl / music_dl / MusicDL.py

        logger.setLevel(logging.INFO)

        """ Print version """

        logger.info(pkg_resources.require("music_dl")[0])

        """ Validate parameters """

        logger.info('Validating parameters...')

        try:
            # Validate download url
            url_parsed = urlparse(self.download_url)
            if not url_parsed.scheme.startswith('http'):
                raise DirectoryException('Invalid URL. URL must start with http*. Input value is {}'.format(self.download_url))
            tld_parsed = tldextract.extract(self.download_url)
            if not (tld_parsed.domain in ['youtube', 'soundcloud']):
                raise DirectoryException('Invalid URL. Music Downloader supports only YouTube and SoundCloud. Input value is {}'.format(self.download_url))
            # Validate download directory
            if not is_path_exists_or_creatable(self.working_dir):
                raise DirectoryException('Invalid directory. Please specify valid download directory. Input value is {}'.format(self.working_dir))

        except DirectoryException as e:
            logger.error(e.message)
            logger.fatal('Aborted.')
            exit()

        # Validate playlist configuration
        try:
            self.playlist.validate()

        except PlaylistParameterException as e:
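
The excerpt above checks the scheme with urlparse and the site with tldextract before downloading. A stripped-down sketch of the same idea; ALLOWED_SITES and the plain ValueError are stand-ins for the project's own configuration and DirectoryException.

from urllib.parse import urlparse
import tldextract

ALLOWED_SITES = {'youtube', 'soundcloud'}  # hypothetical whitelist

def validate_download_url(url):
    # Reject anything that is not an http(s) URL.
    if not urlparse(url).scheme.startswith('http'):
        raise ValueError('URL must start with http*: {}'.format(url))
    # Compare only the registered name, so www.youtube.com and
    # m.youtube.com both count as 'youtube'.
    if tldextract.extract(url).domain not in ALLOWED_SITES:
        raise ValueError('Only YouTube and SoundCloud are supported: {}'.format(url))

validate_download_url('https://www.youtube.com/watch?v=dQw4w9WgXcQ')  # passes silently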

GitHub: codelucas / newspaper / newspaper / urls.py

        # if the file type is a media type, reject instantly
        if file_type and file_type not in ALLOWED_TYPES:
            if verbose: print('\t%s rejected due to bad filetype' % url)
            return False

        last_chunk = path_chunks[-1].split('.')
        # the file type is not of use to us anymore, remove from url
        if len(last_chunk) > 1:
            path_chunks[-1] = last_chunk[-2]

    # Index gives us no information
    if 'index' in path_chunks:
        path_chunks.remove('index')

    # extract the tld (top level domain)
    tld_dat = tldextract.extract(url)
    subd = tld_dat.subdomain
    tld = tld_dat.domain.lower()

    url_slug = path_chunks[-1] if path_chunks else ''

    if tld in BAD_DOMAINS:
        if verbose: print('%s caught for a bad tld' % url)
        return False

    if len(path_chunks) == 0:
        dash_count, underscore_count = 0, 0
    else:
        dash_count = url_slug.count('-')
        underscore_count = url_slug.count('_')

    # If the url has a news slug title
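
Despite the variable name, tld_dat.domain above is the registered name ('cnn' for www.cnn.com), not the literal top-level domain; that is what gets compared against BAD_DOMAINS. A self-contained sketch of that filter, with a made-up blocklist:

import tldextract

BAD_DOMAINS = {'amazon', 'doubleclick', 'twitter'}  # made-up blocklist

def has_acceptable_domain(url, verbose=False):
    # .domain is the registered name ('cnn' for www.cnn.com).
    domain = tldextract.extract(url).domain.lower()
    if domain in BAD_DOMAINS:
        if verbose:
            print('%s caught for a bad domain' % url)
        return False
    return True

print(has_acceptable_domain('http://www.cnn.com/2024/story.html'))  # True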

GitHub: DeFacto / DeFacto / python / trustworthiness / util.py

        except:
            continue

    # appending upper level domains, from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
    # Version 2018040300, Last Updated Tue Apr  3 07:07:01 2018 UTC
    df = pd.read_csv(config.datasets + 'data/iana/org/TLD/tlds-alpha-by-domain.txt', sep=" ", header=None)
    for index, row in df.iterrows():
        print(index, row[0])
        domain.append(str(row[0]).lower())

    df = pd.read_csv(DATASET_MICROSOFT_PATH, delimiter='\t', header=0)
    for index, row in df.iterrows():
        url = str(row[3])
        print(index, url)
        try:
            o = tldextract.extract(url)
            if o.suffix is not None:
                domain_s.append(str(o.suffix).lower())
            if o.domain is not None:
                domain.append(str(o.domain).lower())
        except:
            continue


    le1.fit(domain)
    joblib.dump(le1, ENC_WEB_DOMAIN)
    print(le1.classes_)

    le2.fit(domain_s)
    joblib.dump(le2, ENC_WEB_DOMAIN_SUFFIX)
    print(le2.classes_)
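
The loop above collects domain and suffix strings from several CSV files and fits scikit-learn LabelEncoders on them. A condensed sketch of the core idea, using an in-memory list of URLs instead of the project's datasets:

import tldextract
from sklearn import preprocessing

urls = ['https://www.bbc.co.uk/news', 'http://example.com/page']  # stand-in data

domains, suffixes = [''], ['']   # keep '' as a class for unparseable values
for url in urls:
    parts = tldextract.extract(url)
    # .domain and .suffix are empty strings (never None) when missing.
    domains.append(parts.domain.lower())
    suffixes.append(parts.suffix.lower())

domain_encoder = preprocessing.LabelEncoder().fit(domains)
suffix_encoder = preprocessing.LabelEncoder().fit(suffixes)
print(domain_encoder.classes_)   # ['' 'bbc' 'example']
print(suffix_encoder.classes_)   # ['' 'co.uk' 'com']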

GitHub: codelucas / newspaper / newspaper / extractors.py

                if self.config.verbose:
                    print('elim category url %s for no domain and path'
                          % p_url)
                continue
            if path and path.startswith('#'):
                if self.config.verbose:
                    print('elim category url %s path starts with #' % p_url)
                continue
            if scheme and (scheme != 'http' and scheme != 'https'):
                if self.config.verbose:
                    print(('elim category url %s for bad scheme, '
                           'not http nor https' % p_url))
                continue

            if domain:
                child_tld = tldextract.extract(p_url)
                domain_tld = tldextract.extract(source_url)
                child_subdomain_parts = child_tld.subdomain.split('.')
                subdomain_contains = False
                for part in child_subdomain_parts:
                    if part == domain_tld.domain:
                        if self.config.verbose:
                            print(('subdomain contains at %s and %s' %
                                   (str(part), str(domain_tld.domain))))
                        subdomain_contains = True
                        break

                # Ex. microsoft.com is definitely not related to
                # espn.com, but espn.go.com is probably related to espn.com
                if not subdomain_contains and \
                        (child_tld.domain != domain_tld.domain):
                    if self.config.verbose:
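
The check above treats a category URL as related to the source when the two share a registered name, or when the source's name appears as a label in the child's subdomain (espn.go.com versus espn.com). A self-contained sketch; is_related_url is a hypothetical helper, not newspaper's API:

import tldextract

def is_related_url(candidate_url, source_url):
    child = tldextract.extract(candidate_url)
    source = tldextract.extract(source_url)
    # Same registered name, e.g. news.espn.com vs espn.com.
    if child.domain == source.domain:
        return True
    # Source name buried in the subdomain, e.g. espn.go.com vs espn.com.
    return source.domain in child.subdomain.split('.')

print(is_related_url('http://espn.go.com', 'http://espn.com'))      # True
print(is_related_url('http://microsoft.com', 'http://espn.com'))    # False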

GitHub: DeFacto / DeFacto / python / trustworthiness / features_core.py

    def get_open_page_rank(self, url):
        try:
            o = tldextract.extract(url)
            domain=('%s.%s' % (o.domain, o.suffix))
            try:
                pginfo=self.page_rank.pg[domain]
            except KeyError:
                config.logger.warn('page rank information for domain [' + domain + '] not found')
                return MISSING_FEATURE * 2, True
            return [pginfo['page_rank_decimal'], pginfo['rank']], False
        except Exception as e:
            config.logger.error(repr(e))
            return MISSING_FEATURE * 2, True
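
The method above builds a 'domain.suffix' key ('bbc.co.uk') to look up a precomputed page-rank table. Recent tldextract versions expose a registered_domain property that yields the same string (and '' when there is no recognised suffix); a sketch with a made-up rank table in place of self.page_rank:

import tldextract

page_ranks = {'bbc.co.uk': {'page_rank_decimal': 7.1, 'rank': 87}}  # made-up data

def get_open_page_rank(url):
    parts = tldextract.extract(url)
    # registered_domain joins domain and suffix: 'bbc.co.uk'.
    info = page_ranks.get(parts.registered_domain)
    if info is None:
        return None, True                      # missing-feature marker, error flag
    return [info['page_rank_decimal'], info['rank']], False

print(get_open_page_rank('https://www.bbc.co.uk/news'))   # ([7.1, 87], False)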

GitHub: DeFacto / DeFacto / python / trustworthiness / util.py

    import pandas as pd
    from sklearn import preprocessing
    le1 = preprocessing.LabelEncoder()
    le2 = preprocessing.LabelEncoder()

    domain_s = ['com']
    domain_s = ['']
    domain = ['']

    df_sites = pd.read_csv(DATASET_3C_SITES_PATH, na_values=0, delimiter=',', usecols=['document_url'])
    for index, row in df_sites.iterrows():
        url = str(row[0])
        print(index, url)
        try:
            o = tldextract.extract(url)
            if o.suffix is not None:
                domain_s.append(str(o.suffix).lower())
            if o.domain is not None:
                domain.append(str(o.domain).lower())
        except:
            continue

    # appending upper level domains, from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
    # Version 2018040300, Last Updated Tue Apr  3 07:07:01 2018 UTC
    df = pd.read_csv(config.datasets + 'data/iana/org/TLD/tlds-alpha-by-domain.txt', sep=" ", header=None)
    for index, row in df.iterrows():
        print(index, row[0])
        domain.append(str(row[0]).lower())

    df = pd.read_csv(DATASET_MICROSOFT_PATH, delimiter='\t', header=0)
    for index, row in df.iterrows():
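
One detail worth knowing when writing loops like the ones above: tldextract returns empty strings, never None, for missing pieces, so the 'o.suffix is not None' tests always pass and the try/except plus later filtering do the real work. A truthiness check is the usual way to skip hosts without a public suffix:

import tldextract

parts = tldextract.extract('http://localhost:8080/admin')
print(repr(parts.suffix))   # ''  (empty string, not None)
print(repr(parts.domain))   # 'localhost'

if parts.suffix:            # skips hosts with no recognised public suffix
    print(parts.suffix.lower())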

GitHub: robbielynch / RoblySearch / robly_crawler / crawler.py

def get_base_url(url):
    """
    Takes a URL and returns the subdomain, domain and suffix concatenated
    to form the base url of the website. Uses the tldextract library.
    """
    tld = tldextract.extract(url)
    print(tld.subdomain, ' - ', tld.domain, ' - ', tld.suffix)
    if tld.subdomain != "":
        base_url = '.'.join([tld.subdomain, tld.domain, tld.suffix])
    else:
        base_url = '.'.join([tld.domain, tld.suffix])
    return base_url
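
tldextract also exposes an fqdn property that joins whichever of subdomain, domain and suffix are present, which collapses the if/else above into one line; a sketch assuming that property is available in the installed version:

import tldextract

def get_base_url(url):
    # fqdn skips the empty subdomain, so there are no stray leading dots.
    return tldextract.extract(url).fqdn

print(get_base_url('https://blog.example.com/search'))  # blog.example.com
print(get_base_url('https://example.com/search'))       # example.com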

GitHub: codelucas / newspaper / newspaper / source.py

        self.url = url
        self.url = urls.prepare_url(url)

        self.domain = urls.get_domain(self.url)
        self.scheme = urls.get_scheme(self.url)

        self.categories = []
        self.feeds = []
        self.articles = []

        self.html = ''
        self.doc = None

        self.logo_url = ''
        self.favicon = ''
        self.brand = tldextract.extract(self.url).domain
        self.description = ''

        self.is_parsed = False
        self.is_downloaded = False
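
The constructor above derives the source's 'brand' straight from the registered name of its URL. A small sketch of the same idea; describe_source and the use of urlparse are stand-ins for newspaper's own urls helpers:

from urllib.parse import urlparse
import tldextract

def describe_source(url):
    # Scheme from the standard library, brand from the registered name.
    return {
        'scheme': urlparse(url).scheme,
        'brand': tldextract.extract(url).domain,
    }

print(describe_source('https://www.theguardian.com/world'))
# {'scheme': 'https', 'brand': 'theguardian'}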

tldextract

Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
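
Opting into the Public Suffix List's private section is done by building a TLDExtract instance with include_psl_private_domains=True; the hostname below is only illustrative:

import tldextract

# Default extractor: ICANN suffixes only, so blogspot.com splits as
# subdomain='myblog', domain='blogspot', suffix='com'.
print(tldextract.extract('myblog.blogspot.com'))

# With private domains, blogspot.com itself is the suffix.
private_aware = tldextract.TLDExtract(include_psl_private_domains=True)
print(private_aware('myblog.blogspot.com'))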

License: BSD-3-Clause