How to use the tldextract.extract function in tldextract

To help you get started, we’ve selected a few tldextract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

gwen001 / pentest-tools / csp-analyzer.py View on Github

def getWarningLevel( t_tld_orig, item ):
    """Rate how far ``item`` is from the reference host ``t_tld_orig``.

    ``t_tld_orig`` is an extraction result exposing ``subdomain``, ``domain``
    and ``suffix``. ``item`` is a host or URL; ``https://`` is prepended when
    it does not already start with ``http``.

    Returns:
        0 when ``item`` is listed in the module-level ``t_help``;
        1 when subdomain, domain and suffix all match the reference;
        2 when only domain and suffix match;
        3 otherwise.
        Levels 1-3 are raised by one when the network location contains ``*``.
    """
    if item in t_help:
        return 0

    # urlparse only fills in netloc when a scheme is present.
    url = item if item.startswith('http') else 'https://' + item
    netloc = urlparse( url ).netloc
    parts = tldextract.extract( netloc )

    same_registered = (
        parts.domain == t_tld_orig.domain
        and parts.suffix == t_tld_orig.suffix
    )
    if not same_registered:
        level = 3
    elif parts.subdomain == t_tld_orig.subdomain:
        level = 1
    else:
        level = 2

    # A wildcard source widens the match, so it is ranked one step higher.
    if '*' in netloc:
        level += 1

    return level

gwen001 / pentest-tools / cloudflare-origin-ip.py View on Github

def grabSubs( domain ):
    """Collect host names for ``domain`` from crt.sh and resolve them.

    Queries ``https://crt.sh/?q=%25.<domain>&output=json``. Every host name
    found that is not ``domain`` itself and not yet known is appended to the
    module-level ``t_subs`` list; each address it resolves to that is not yet
    in the module-level ``t_ips`` list is appended there. A summary line is
    printed at the end.

    A failed request prints an error and returns; a non-200 response returns
    silently. Nothing is returned in any case.
    """
    print( "[+] Grabbing subdomains from crt.sh: %s" % domain )
    url = 'https://crt.sh/?q=%25.' + domain + '&output=json'
    try:
        r = requests.get( url )
    except Exception as e:
        print( colored("[-] error occured: %s" % e, 'red') )
        return
    if r.status_code != 200:
        return

    n = 0
    for item in r.json():
        # A single crt.sh entry can list several names in name_value,
        # separated by newlines; handle each name on its own.
        for name in item['name_value'].split( '\n' ):
            sub = name.strip().replace( '*.', '' )
            if not sub or sub == domain or sub in t_subs:
                continue
            t_subs.append( sub )
            try:
                data = socket.gethostbyname( sub )
            except Exception:
                # Best effort: a name that does not resolve stays in t_subs
                # but contributes no address.
                continue
            if data not in t_ips:
                n = n + 1
                t_ips.append( data )
    print( colored("[+] %d subdomains found, %d ips added" % (len(t_subs),n), 'green') )

PrivacyScore / privacyscanner / privacyscanner / scanmodules / openwpm.py View on Github

rule = line.split('$')[0]
            if is_acceptable_rule(rule):
                rules.append(rule)
        except Exception:
            logger.exception('Unexpected error while applying easylist rules.')

    abr = AdblockRules(rules)

    elapsed = timeit.default_timer() - start_time
    logger.info('Took %i secs to parse easylist rules' % elapsed)

    i = 0

    for url in third_party_requests:
        if abr.should_block(url):
            ext = tldextract.extract(url)
            trackers.append("{}.{}".format(ext.domain, ext.suffix))
        i = i + 1
        if i % 20 == 0:
            elapsed = timeit.default_timer() - start_time
            logger.info("Checked %i domains, %i secs elapsed..." % (i, elapsed))
    return list(set(trackers))

bit4woo / teemo / domainsites / CrtSearch.py View on Github

def get_related_domains(self):
        result = []
        main_of_domain = tldextract.extract(self.domain).domain

        reg_urls = re.compile('<a href="\?id=(.*?)">
        urls = reg_urls.findall(self.resp)


        reg_domains = re.compile('DNS:(.*?)<br>') #DNS:*.jdpay.com<br>

        for item in urls:
            url = "https://crt.sh/?id={0}".format(item)
            resp = req.get(url, proxies=self.proxy).content

            reg_common_name = re.compile("Subject:<br>(.*?)<br>")
            common_name = reg_common_name.findall(resp)
            if len(common_name) !=0:
                common_name = common_name[0].replace("&nbsp;", "").split("=")[-1]
                main_of_cn_domain = tldextract.extract(common_name).domain</a>

paulnaoki / DomainFinderSrcUniversal / DomainFinderSrc / Scrapers / LinkChecker.py View on Github

def get_root_domain(full_link: str, use_www=True) ->(False, str, str, str, str, str, str):
    """
    Break a URL down into its root-domain related parts.

    :param full_link: e.g "http://www.google.com"
    :param use_www: when True, the link to the root domain is built with a
        "www." prefix (for a link without sub-domain only if "www." does not
        already occur in full_link)
    :return: Tuple(True if the link has no sub-domain else False, root domain,
        link to root domain, link to sub.domain, sub.domain, suffix of the
        domain, domain without suffix); (False, "", "", "", "", "", "") when
        no domain or no suffix could be extracted
    """
    # Only http and https are distinguished; anything else is treated as http.
    scheme = ("https" if full_link.startswith("https") else "http") + "://"

    ext = tldextract.extract(full_link)
    if not ext.domain or not ext.suffix:
        return False, "", "", "", "", "", ""

    www = "www."
    root = ext.domain + "." + ext.suffix

    if not ext.subdomain:
        # No sub-domain: root link and sub-domain link are the same host.
        host = www + root if use_www and www not in full_link else root
        return True, root, scheme + host, scheme + host, host, ext.suffix, ext.domain

    sub_domain = ext.subdomain + "." + root
    root_host = www + root if use_www else root
    return False, root, scheme + root_host, scheme + sub_domain, sub_domain, ext.suffix, ext.domain

waterbear-cloud / paco / src / paco / aws_api / acm / ACM.py View on Github

def get_domain_from_host(validation_dns_record):
    """Return the ``domain.suffix`` part of the given FQDN.

    The host is split with ``tldextract.extract`` and any subdomain labels
    are dropped.
    """
    parts = tldextract.extract(validation_dns_record)
    return "{}.{}".format(parts.domain, parts.suffix)

observerss / pygodaddy / pygodaddy / client.py View on Github

def _split_hostname(self, hostname):
    """Split ``hostname`` into a ``(prefix, domain)`` pair.

    ``prefix`` is the subdomain part, or ``'@'`` when the hostname has no
    subdomain; ``domain`` is the registered domain reported by tldextract.
    """
    parts = tldextract.extract(hostname)
    return (parts.subdomain or '@'), parts.registered_domain

fportantier / habu / habu / cli / cmd_data_extract_domain.py View on Github

result = set()

    for m in match:

        candidate = m.group(0).lower()

        if '.' not in candidate:
            continue

        if not re.match('[a-z]+', candidate):
            continue

        if not re.match('[a-z0-9]+\.[a-z0-9]', candidate):
            continue

        tld = tldextract.extract(candidate)
        if tld.suffix:
            result.add(tld.domain + '.' + tld.suffix.rstrip('.'))

    return list(result)

PrivacyScore / privacyscanner / privacyscanner / scanmodules / serverleaks.py View on Github

def _concat_full(url, suffix):
    """Return the host of ``url`` (subdomain included) followed by ``suffix``.

    The host is rebuilt from the subdomain, domain and public suffix that
    ``extract`` reports for ``url``; the subdomain is left out when empty.
    """
    parts = extract(url)
    labels = [parts.domain + "." + parts.suffix]
    if parts.subdomain != "":
        labels.insert(0, parts.subdomain)
    return ".".join(labels) + suffix

alienwithin / Scripts-Sploits / giveWebHead.py View on Github

def gwhEngine(target, wordlist, method, redirects=False):
    error_codes_non_redir=[200,403]
    error_codes_redir= [200,301,302,403]
    with open(wordlist) as dirPerLine:
		for dir in dirPerLine:
			cleanDirName=str(dir.rstrip('\n'))
			fullURL=tldextract.extract(target)
			getHostname=fullURL.domain	
			resultFile=open(str(getHostname)+'.csv', 'a')
			badResults=open(str(getHostname)+'_ignored.csv', 'a')
			csvWritingObject = csv.writer(resultFile)
			BadResultObject=csv.writer(badResults)
			if method=="HEAD" and redirects=="False":
				gwhRequester=requests.head(target+cleanDirName,verify=False)
				gwhStatus=gwhRequester.status_code
				if gwhStatus in error_codes_non_redir:
					csvWritingObject.writerow( (target+cleanDirName, gwhStatus) )
					resultFile.close()
					print target+cleanDirName+" => "+ str(gwhStatus)		
			elif method=="HEAD" and redirects=="True":
				gwhRequester=requests.head(target+cleanDirName,verify=False)
				gwhStatus=gwhRequester.status_code
				if gwhStatus in error_codes_redir:

How to use the tldextract.extract function in tldextract

To help you get started, we’ve selected a few tldextract examples, based on popular ways it is used in public projects.

tldextract

Package Health Score

Popular tldextract functions

Similar packages