# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _setURL(self,url):
    """Record *url* as this adapter's story URL and cache its parsed parts.

    Stores the raw url, its urlparse result, and the host/path pieces on
    self, then mirrors the story and section URLs into the story metadata
    (entities left intact via condremoveentities=False).
    """
    parsed = urlparse(url)
    self.url = url
    self.parsedUrl = parsed
    self.host = parsed.netloc
    self.path = parsed.path
    self.story.setMetadata('storyUrl', self.url, condremoveentities=False)
    self.story.setMetadata('sectionUrl', self._section_url(self.url), condremoveentities=False)
# No additional login is required, just check for adult
# NOTE(review): this span is the interior of a metadata-extraction method --
# no enclosing 'def' is visible and the trailing 'try' is cut off, so the
# code is kept byte-identical with comments only.  'soup' is presumably an
# already-fetched BeautifulSoup page -- confirm against the full method.
pagetitle_div = soup.find('div', id='pagetitle')
# A 'javascript:' href on the title link appears to mark an age-gated story.
if pagetitle_div.a['href'].startswith('javascript:'):
if not(self.is_adult or self.getConfig('is_adult')):
raise exceptions.AdultCheckRequired(self.url)
# User has consented: re-fetch the page with the age-consent suffix appended.
url = ''.join([self.url, self.METADATA_URL_SUFFIX, self.AGE_CONSENT_URL_SUFFIX])
soup = self._customized_fetch_url(url)
pagetitle_div = soup.find('div', id='pagetitle')
# First anchor in the page title is the story title.
self.story.setMetadata('title', stripHTML(pagetitle_div.a))
# The anchor following the title link is the author link.
author_anchor = pagetitle_div.a.findNextSibling('a')
url = urlparse.urljoin(self.BASE_URL, author_anchor['href'])
components = urlparse.urlparse(url)
query_data = urlparse.parse_qs(components.query)
self.story.setMetadata('author', stripHTML(author_anchor))
# parse_qs returns lists; the author id is the first 'uid' query value.
self.story.setMetadata('authorId', query_data['uid'][0])
self.story.setMetadata('authorUrl', url)
sort_div = soup.find('div', id='sort')
# Second anchor inside the 'sort' div carries the review count.
self.story.setMetadata('reviews', stripHTML(sort_div('a')[1]))
# Walk the listbox spans as 'Key:' / value pairs of story metadata.
listbox_tag = soup.find('div', {'class': 'listbox'})
for span_tag in listbox_tag('span'):
key = span_tag.string
if key:
key = key.strip(' :')
# NOTE(review): fragment ends inside this 'try' -- the matching
# 'except' is outside the visible span.
try:
value = stripHTML(span_tag.nextSibling)
def form_url(parenturl,url):
    """Resolve a possibly-relative *url* against *parenturl*.

    Mirrors simple browser behaviour:
    - anything containing '//' (absolute or protocol-relative) is returned
      untouched, as is any url when there is no parent to resolve against;
    - a site-absolute path ('/...') is joined to the parent's scheme+host;
    - a bare relative path is joined to the parent url's directory.

    The visible original was truncated mid-call (unterminated urlunparse
    tuple, no return); this completes it in the obvious way.
    """
    url = url.strip() # ran across an image with a space in the
                      # src. Browser handled it, so we'd better, too.
    if "//" in url or parenturl is None:
        returl = url
    else:
        parsedUrl = urlparse(parenturl)
        if url.startswith("/"):
            # Site-absolute: keep only scheme and host from the parent.
            returl = urlunparse(
                (parsedUrl.scheme,
                 parsedUrl.netloc,
                 url,
                 '','',''))
        else:
            # Relative: join onto the parent's directory.  NOTE: when the
            # parent path already ends with '/', this yields a double slash
            # ('/dir//img'); preserved since browsers/servers accept it.
            if parsedUrl.path.endswith("/"):
                toppath = parsedUrl.path
            else:
                toppath = parsedUrl.path[:parsedUrl.path.rindex('/')]
            returl = urlunparse(
                (parsedUrl.scheme,
                 parsedUrl.netloc,
                 toppath + '/' + url,
                 '','',''))
    return returl
def _get_class_for(url):
## NOTE(review): indentation has been stripped from this source and the
## function is truncated -- 'clslst' is never returned within the visible
## span.  Code lines are kept byte-identical; comments only.
## fix up leading protocol.
## Normalizes mistyped schemes ('hxxp://', 'htp:/', 'http//', ...) to
## 'http://', keeping an 's' if one was present.
fixedurl = re.sub(r"(?i)^[htp]+(s?)[:/]+",r"http\1://",url.strip())
# Protocol-relative '//host/...' -> assume plain http.
# NOTE(review): uses the unstripped 'url' here rather than 'fixedurl';
# leading whitespace would survive -- looks like 'fixedurl' was intended.
if fixedurl.startswith("//"):
fixedurl = "http:%s"%url
# No scheme at all -> prepend 'http://' (same unstripped-'url' caveat).
if not fixedurl.startswith("http"):
fixedurl = "http://%s"%url
## remove any trailing '#' locations, except for #post-12345 for
## XenForo
if not "#post-" in fixedurl:
fixedurl = re.sub(r"#.*$","",fixedurl)
parsedUrl = urlparse(fixedurl)
# Hostnames are case-insensitive; canonicalize the netloc to lower case.
domain = parsedUrl.netloc.lower()
if( domain != parsedUrl.netloc ):
fixedurl = fixedurl.replace(parsedUrl.netloc,domain)
# First lookup: adapter classes registered for the exact domain.
clslst = _get_classlist_fromlist(domain)
## assumes all adapters for a domain will have www or not have www
## but not mixed.
if not clslst and domain.startswith("www."):
# Retry without the 'www.' prefix and normalize the url to match.
# NOTE(review): replace() strips 'www.' anywhere in the string, not just
# the prefix -- 'www.www.example.com'-style hosts would lose both.
domain = domain.replace("www.","")
#logger.debug("trying site:without www: "+domain)
clslst = _get_classlist_fromlist(domain)
fixedurl = re.sub(r"^http(s?)://www\.",r"http\1://",fixedurl)
if not clslst:
#logger.debug("trying site:www."+domain)
# Last resort: retry WITH a 'www.' prefix, normalizing the url again.
clslst =_get_classlist_fromlist("www."+domain)
fixedurl = re.sub(r"^http(s?)://",r"http\1://www.",fixedurl)
def _get_query_data(url):
components = urlparse.urlparse(url)
query_data = urlparse.parse_qs(components.query)
return dict((key, data[0]) for key, data in query_data.items())
def _get_class_for(url):
## NOTE(review): this is a byte-identical duplicate of the _get_class_for
## definition earlier in this file (same truncation: 'clslst' is never
## returned within the visible span); one copy should likely be removed.
## Indentation has been stripped; code lines kept byte-identical.
## fix up leading protocol.
## Normalizes mistyped schemes ('hxxp://', 'htp:/', 'http//', ...) to
## 'http://', keeping an 's' if one was present.
fixedurl = re.sub(r"(?i)^[htp]+(s?)[:/]+",r"http\1://",url.strip())
# Protocol-relative '//host/...' -> assume plain http.
# NOTE(review): uses the unstripped 'url' rather than 'fixedurl' here and
# below -- looks like 'fixedurl' was intended.
if fixedurl.startswith("//"):
fixedurl = "http:%s"%url
if not fixedurl.startswith("http"):
fixedurl = "http://%s"%url
## remove any trailing '#' locations, except for #post-12345 for
## XenForo
if not "#post-" in fixedurl:
fixedurl = re.sub(r"#.*$","",fixedurl)
parsedUrl = urlparse(fixedurl)
# Hostnames are case-insensitive; canonicalize the netloc to lower case.
domain = parsedUrl.netloc.lower()
if( domain != parsedUrl.netloc ):
fixedurl = fixedurl.replace(parsedUrl.netloc,domain)
# First lookup: adapter classes registered for the exact domain.
clslst = _get_classlist_fromlist(domain)
## assumes all adapters for a domain will have www or not have www
## but not mixed.
if not clslst and domain.startswith("www."):
# Retry without the 'www.' prefix and normalize the url to match.
domain = domain.replace("www.","")
#logger.debug("trying site:without www: "+domain)
clslst = _get_classlist_fromlist(domain)
fixedurl = re.sub(r"^http(s?)://www\.",r"http\1://",fixedurl)
if not clslst:
#logger.debug("trying site:www."+domain)
# Last resort: retry WITH a 'www.' prefix, normalizing the url again.
clslst =_get_classlist_fromlist("www."+domain)
fixedurl = re.sub(r"^http(s?)://",r"http\1://www.",fixedurl)