Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
self.story.setMetadata('author',a.text)
logger.debug("Author: (%s)"%self.story.getMetadata('author'))
# Find the chapters:
chapters = soup.find('div', {'class' : 'part_list'})
if chapters != None:
chapters=chapters.findAll('a', href=re.compile(r'/readfic/'+self.story.getMetadata('storyId')+"/\d+#part_content$"))
self.story.setMetadata('numChapters',len(chapters))
for x in range(0,len(chapters)):
chapter=chapters[x]
churl='http://'+self.host+chapter['href']
self.chapterUrls.append((stripHTML(chapter),churl))
if x == 0:
pubdate = translit.translit(stripHTML(bs.BeautifulSoup(self._fetchUrl(churl)).find('div', {'class' : 'part_added'}).find('span')))
if x == len(chapters)-1:
update = translit.translit(stripHTML(bs.BeautifulSoup(self._fetchUrl(churl)).find('div', {'class' : 'part_added'}).find('span')))
else:
self.chapterUrls.append((self.story.getMetadata('title'),url))
self.story.setMetadata('numChapters',1)
pubdate=translit.translit(stripHTML(soup.find('div', {'class' : 'part_added'}).find('span')))
update=pubdate
logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
if not ',' in pubdate:
pubdate=datetime.date.today().strftime(self.dateformat)
if not ',' in update:
update=datetime.date.today().strftime(self.dateformat)
pubdate=pubdate.split(',')[0]
update=update.split(',')[0]
fullmon = {"yanvarya":"01", "января":"01",
raise e
if self.needToLoginCheck(data):
# need to log in for this one.
self.performLogin(url)
data = self._fetchUrl(url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
if "Please log in now" in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: You need to have access to the restricted section.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
a = soup.find('h1')
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
for info in asoup.findAll('table', {'cellpadding' : '5'}):
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value).split(': ')[1].split(';')[0], self.dateformat))
if 'Updated' in label:
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value).split(': ')[1], self.dateformat))
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
pass
def getChapterText(self, url):
    """Fetch the chapter page at ``url`` and return its story text.

    The page is downloaded with ``self._fetchUrl`` and parsed with
    BeautifulSoup; the ``<div id="story">`` element is then handed to
    ``self.utf8FromSoup`` and that call's result is returned.

    Raises ``exceptions.FailedToDownload`` when the page has no
    ``<div id="story">`` element.
    """
    logger.debug('Getting chapter text from: %s' % url)

    page = self._fetchUrl(url)
    # br/hr are declared self-closing; otherwise soup eats the br/hr tags.
    parsed = bs.BeautifulSoup(page, selfClosingTags=('br','hr'))

    story_div = parsed.find('div', {'id' : 'story'})
    if story_div is not None:
        return self.utf8FromSoup(url, story_div)

    raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
def extractChapterUrlsAndMetadata(self):
url=self.url
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# Now go hunting for all the meta data and the chapter list.
table = soup.find('td',{'width':'50%'})
## Title
a = soup.find('h1')
self.story.setMetadata('title',stripHTML(a))
logger.debug("Title: (%s)"%self.story.getMetadata('title'))
# Find authorid and URL from... author url.
a = table.find('a')
self.story.setMetadata('authorId',a.text) # Author's name is unique
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.text)
logger.debug("Author: (%s)"%self.story.getMetadata('author'))
self.story.addToList('characters',char)
if 'Status' in stripHTML(label):
if value.find('img', {'src' : 'img/incomplete.gif'}) == None:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in stripHTML(label):
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Updated' in stripHTML(label):
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
a = self._fetchUrl(self.story.getMetadata('authorUrl')+'&cat=stories')
for story in bs.BeautifulSoup(a).findAll('table', {'class' : 'storyinfo'}):
a = story.find('a', href=re.compile(r"review.php\?s\="+self.story.getMetadata('storyId')+'&act=view'))
if a != None:
for labels in story.findAll('tr'):
value = labels.findAll('td')[1]
label = labels.findAll('td')[0]
if 'genre' in stripHTML(label):
for genre in value.findAll('img'):
self.story.addToList('genre',genre['title'])
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Updated' in label:
# there's a stray [ at the end.
#value = value[0:-1]
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
pass
def getChapterText(self, url):
    """Fetch the chapter page at ``url`` and return its story text.

    The page is downloaded with ``self._fetchUrl`` and parsed with
    BeautifulSoup using its default options; the ``<div id="story">``
    element is then handed to ``self.utf8FromSoup`` and that call's
    result is returned.

    Raises ``exceptions.FailedToDownload`` when the page has no
    ``<div id="story">`` element.
    """
    logger.debug('Getting chapter text from: %s' % url)

    page = self._fetchUrl(url)
    parsed = bs.BeautifulSoup(page)

    story_div = parsed.find('div', {'id' : 'story'})
    if story_div is not None:
        return self.utf8FromSoup(url, story_div)

    raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
def getChapterText(self, url):
    """Fetch the chapter page at ``url`` and return its story text.

    The page is downloaded with ``self._fetchUrl`` and parsed with
    BeautifulSoup. The ``<div id="resizeableText">`` element is located,
    its nested ``<div class="storyTools">`` child (if any) is removed,
    and the remaining element is handed to ``self.utf8FromSoup``, whose
    result is returned.

    Raises ``exceptions.FailedToDownload`` when the page has no
    ``<div id="resizeableText">`` element.
    """
    logger.debug('Getting chapter text from: %s' % url)
    soup = bs.BeautifulSoup(self._fetchUrl(url),
                            selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

    div = soup.find('div', {'id' : 'resizeableText'})
    # Check for the container BEFORE touching it: previously the
    # storyTools lookup ran first, so a missing container surfaced as an
    # AttributeError on None and this error was never raised.
    if div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    # Remove the storyTools child so it is not included in the chapter
    # text; skip the removal when the page does not contain one.
    tools = div.find('div', {'class' : 'storyTools'})
    if tools is not None:
        tools.extract()

    return self.utf8FromSoup(url,div)