Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# =============================================================================
# Ural Google-related heuristic functions
# =============================================================================
#
# Collection of functions related to Google urls.
#
import re
from ural.utils import safe_urlsplit, unquote
from ural.patterns import QUERY_VALUE_IN_URL_TEMPLATE
# Case-insensitive: the literal "amp", then a captured group made of an
# underscore and at least one more character, then an optional "=".
AMP_QUERY_RE = re.compile(
    r'amp(_.+)=?',
    flags=re.IGNORECASE
)

# Case-insensitive AMP markers anchored at the end of the searched string:
#   - ".amp" immediately followed by a final ".html" (only ".amp" is consumed)
#   - a final ".amp", optionally followed by a slash
#   - a final "amp" preceded by a slash, optionally followed by a slash
AMP_SUFFIXES_RE = re.compile(
    r'(?:\.amp(?=\.html$)|\.amp/?$|(?<=/)amp/?$)',
    flags=re.IGNORECASE
)
# The shared query-value template filled in with the parameter name "url".
# NOTE(review): presumably captures the value of a `url` query parameter;
# confirm against the template defined in ural.patterns.
URL_EXTRACT_RE = re.compile(
    QUERY_VALUE_IN_URL_TEMPLATE % r'url'
)
def is_amp_url(url):
    """
    Function returning whether the given url looks like an AMP url, judging
    by its hostname and its path.

    Args:
        url (str): Url to test.

    Returns:
        bool: True if any of the hostname/path checks below matches,
            False otherwise.
    """
    splitted = safe_urlsplit(url)

    # A split result can report no hostname at all (urllib uses None when the
    # url has no network location). Falling back to an empty string keeps the
    # checks below from raising an AttributeError on such input.
    # NOTE(review): assumes safe_urlsplit returns a urllib-like split result.
    hostname = splitted.hostname or ''

    # Hosts served from the AMP project cache.
    if hostname.endswith('.ampproject.org'):
        return True

    # Hosts whose first label starts with "amp-" or is exactly "amp".
    if hostname.startswith(('amp-', 'amp.')):
        return True

    # An "amp" segment anywhere inside the path.
    if '/amp/' in splitted.path:
        return True

    # Always answer with a real boolean rather than an implicit None.
    return False
# =============================================================================
# Ural Redirection Inference Function
# =============================================================================
#
# A lot of urls contain an obvious hint that they will in fact trigger
# a redirection. This module gathers routines aimed at discovering
# those redirections without even firing an HTTP request.
#
import re
from ural.patterns import QUERY_VALUE_IN_URL_TEMPLATE
from ural.utils import unquote, urljoin
# The shared query-value template filled in with a case-insensitive
# alternation of parameter names: "redirect", "redirect_to", "url", "l", "u".
# NOTE(review): how the value is captured depends on the template defined in
# ural.patterns; confirm there.
OBVIOUS_REDIRECTS_RE = re.compile(
    QUERY_VALUE_IN_URL_TEMPLATE % r'(?:redirect(?:_to)?|url|[lu])',
    flags=re.IGNORECASE
)
# Case-insensitive prefixes matched anywhere in the searched string:
#   - ".ampproject.org/c/" or ".ampproject.org/v/", with an optional "s/"
#   - "bc.marfeelcache.com/amp/"
#   - "bc.marfeel.com/"
REDIRECTION_DOMAINS_RE = re.compile(
    r'(?:\.ampproject\.org/[cv]/(?:s/)?|bc\.marfeelcache\.com/amp/|bc\.marfeel\.com/)',
    flags=re.IGNORECASE
)
def infer_redirection(url):
"""
Function returning the url that the given url will redirect to. This is done
by finding obvious hints in the GET parameters that the given url is in
fact a redirection.
Args:
url (string): Target url.
Returns:
string: Redirected url or the original url if nothing was found.
"""
urlunsplit,
safe_urlsplit,
SplitResult
)
# A run of at least eight ASCII digits, found anywhere in the searched string.
NUMERIC_ID_RE = re.compile(r'[0-9]{8,}')

# Root url of the Facebook website.
BASE_FACEBOOK_URL = 'https://www.facebook.com'

# Whole-string id patterns. They use `[0-9]` (like NUMERIC_ID_RE) rather than
# `\d`, which also accepts non-ASCII Unicode digits on str patterns, and `\Z`
# rather than `$`, which would also match right before a trailing newline and
# therefore accept values such as "123\n".
FACEBOOK_ID_RE = re.compile(r'^[0-9]+\Z')
FACEBOOK_FULL_ID_RE = re.compile(r'^[0-9]+_[0-9]+\Z')
# Case-insensitive: the searched string must end with "facebook." followed by
# a single dot-free label, or with "fb.me".
FACEBOOK_DOMAIN_RE = re.compile(
    r'(?:facebook\.[^.]+$|fb\.me$)',
    flags=re.IGNORECASE
)
# DOMAIN_TEMPLATE filled in with a case-insensitive domain pattern: any number
# of dot-terminated labels followed by "facebook.<label>" or "fb.me".
# NOTE(review): what surrounds the domain (scheme, path...) is decided by
# DOMAIN_TEMPLATE, which is defined outside this excerpt.
FACEBOOK_URL_RE = re.compile(
    DOMAIN_TEMPLATE % r'(?:[^.]+\.)*(?:facebook\.[^.]+|fb\.me)',
    flags=re.IGNORECASE
)
# Case-insensitive: "facebook." at the very start of the string, optionally
# preceded by one dot-terminated label, which is captured as group 1.
MOBILE_REPLACE_RE = re.compile(
    r'^([^.]+\.)?facebook\.',
    flags=re.IGNORECASE
)
# The shared query-value template filled in with the parameter name "u".
# NOTE(review): presumably captures the value of a `u` query parameter;
# confirm against the template defined in ural.patterns.
URL_EXTRACT_RE = re.compile(
    QUERY_VALUE_IN_URL_TEMPLATE % r'u'
)
def is_facebook_id(value):
    """
    Function returning whether the given value matches FACEBOOK_ID_RE.

    Args:
        value (str): Value to test.

    Returns:
        bool: True if the pattern is found in the value, False otherwise.
    """
    found = FACEBOOK_ID_RE.search(value)

    return found is not None
def is_facebook_full_id(value):
    """
    Function returning whether the given value matches FACEBOOK_FULL_ID_RE.

    Args:
        value (str): Value to test.

    Returns:
        bool: True if the pattern is found in the value, False otherwise.
    """
    found = FACEBOOK_FULL_ID_RE.search(value)

    return found is not None
def is_facebook_url(url):
"""
Function returning whether the given url is a valid Facebook url.
Args:
url (str): Url to test.