Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
attributesText += processElement(child)
continue
attributesText += processElement(element)
elements = starter.nextGenerator()
for element in elements:
if isinstance(element, bs4.Tag):
if element == bound:
break
elif element.name == 'img':
rating = self._parseRatingFromImage(element)
if rating:
attributes['rating'] = rating
break
except AttributeError or TypeError:
raise ParsingError(u'Failed to locate and collect attributes.')
separators = u"\r\n :;."
freestandingText = u''
for line in attributesText.split(u'\n'):
if line.count(u':') != 1:
freestandingText += line
continue
key, value = line.split(u':', 1)
key = key.strip(separators).lower()
value = value.strip().strip(separators)
parsed = self._parseAttribute(key, value)
for parsedKey, parsedValue in parsed.items():
attributes[parsedKey] = parsedValue
freestandingText = freestandingText.strip()
if 'summary' not in attributes and freestandingText:
def _parseAuthor(self):
    """Locate and parse chapter author's information to a dictionary with author's `id' and `name'.

    The author link is looked up as the first `<a>' sibling following the
    `<span class="glyphicon-user">' element; the ID is extracted from the
    link's `onclick' attribute and the name from the link's text.

    Returns:
        dict: `{'id': <matched ID text>, 'name': <link text passed through stripHTML>}'.

    Raises:
        ParsingError: if the icon span or the link after it cannot be found,
            or if `onclick' is absent or holds no matching ID.
    """
    try:
        authorLink = self._document \
            .find('span', {'class': 'glyphicon-user'}) \
            .findNextSibling('a')
    except AttributeError:
        # `find' returned None: the icon span is not in the document.
        raise ParsingError(u'Failed to locate author link.')
    # `findNextSibling' signals "no match" by returning None rather than raising,
    # so it has to be checked explicitly before the link is used.
    if authorLink is None:
        raise ParsingError(u'Failed to locate author link.')
    # `.get' keeps a link without `onclick' on the ParsingError path instead of
    # leaking a KeyError; the raw literal keeps `\d' a regex class, not a string escape.
    match = re.search(r'(8-\d+)', authorLink.get('onclick') or u'')
    if not match:
        raise ParsingError(u'Failed to extract author ID.')
    return {
        'id': match.group(0),
        'name': stripHTML(authorLink.text)
    }
def _parseAuthor(self):
    """Locate and parse chapter author's information to a dictionary with author's `id' and `name'.

    The author link is looked up as the first `<a>' sibling following the
    `<span class="glyphicon-user">' element; the ID is extracted from the
    link's `onclick' attribute and the name from the link's text.

    Returns:
        dict: `{'id': <matched ID text>, 'name': <link text passed through stripHTML>}'.

    Raises:
        ParsingError: if the icon span or the link after it cannot be found,
            or if `onclick' is absent or holds no matching ID.
    """
    try:
        authorLink = self._document \
            .find('span', {'class': 'glyphicon-user'}) \
            .findNextSibling('a')
    except AttributeError:
        # `find' returned None: the icon span is not in the document.
        raise ParsingError(u'Failed to locate author link.')
    # `findNextSibling' signals "no match" by returning None rather than raising,
    # so it has to be checked explicitly before the link is used.
    if authorLink is None:
        raise ParsingError(u'Failed to locate author link.')
    # `.get' keeps a link without `onclick' on the ParsingError path instead of
    # leaking a KeyError; the raw literal keeps `\d' a regex class, not a string escape.
    match = re.search(r'(8-\d+)', authorLink.get('onclick') or u'')
    if not match:
        raise ParsingError(u'Failed to extract author ID.')
    return {
        'id': match.group(0),
        'name': stripHTML(authorLink.text)
    }
def _parseDate(self):
"""Locate and parse chapter date.

The date text is read from the `<time itemprop="dateCreated">' element, with
newlines removed and surrounding whitespace stripped. The relative labels
u'Вчера' ("Yesterday") and u'Сегодня' ("Today") are resolved against the
current day; any other text is handed to `makeDate' with the format
'%d.%m.%Y, %H:%M'.

Raises ParsingError when reading the element's text fails with AttributeError.
"""
try:
dateText = self._document.find('time', {'itemprop': 'dateCreated'}).text
dateText = dateText.replace(u'\n', u'')
dateText = dateText.strip()
except AttributeError:
raise ParsingError(u'Failed to locate date.')
# The site uses Europe/Moscow (MSK, UTC+0300) server time.
# NOTE(review): `datetime.now()' is naive local time, so adding three hours
# only yields Moscow time when the host clock runs on UTC -- confirm, or
# derive the value from a UTC clock instead.
def todayInMoscow():
now = datetime.datetime.now() + datetime.timedelta(hours=3)
# Drop the time-of-day part: only year, month and day are kept.
today = datetime.datetime(now.year, now.month, now.day)
return today
def parseDateText(text):
# u'Вчера' is Russian for "Yesterday".
if text == u'Вчера':
return todayInMoscow() - datetime.timedelta(days=1)
# u'Сегодня' is Russian for "Today".
elif text == u'Сегодня':
return todayInMoscow()
else:
# Anything else is treated as an absolute "day.month.year, hour:minute" stamp.
return makeDate(text, '%d.%m.%Y, %H:%M')
date = parseDateText(dateText)
# NOTE(review): the visible block ends here without returning or storing
# `date' -- the remainder may be cut off; verify against the full source.
def __collectTextElements(self):
    """Gather every node holding chapter text beneath one freshly created root.

    Chapter text may sit in <p> paragraphs, <div> divisions or plain text
    nodes. The `articleBody' container and each of its children are appended
    to a new <td> tag; when the `excludeEditorSignature' option is set, that
    tag is passed through `_excludeEditorSignature' before being returned.

    Raises ParsingError when the `articleBody' container cannot be found.
    """
    body = self._document.find('div', {'itemprop': 'articleBody'})
    if body is None:
        # FIXME: This will occur if the method is called more than once.
        # The reason is elements appended to `root' are removed from the document.
        # BS 4.4 implements cloning via `copy.copy()', but supporting it for BS 4.3
        # would be error-prone (due to relying on BS internals) and is not needed.
        if self._textElement:
            logger.debug(u"You may not call this function more than once!")
        raise ParsingError(u'Failed to locate text.')

    # Take a snapshot of the nodes before re-parenting any of them, because
    # appending to the new root detaches a node from the document.
    nodes = [body]
    for child in body.childGenerator():
        if child is None:
            # A None from the generator is treated as the end of the children.
            break
        nodes.append(child)

    root = bs4.Tag(name='td')
    for node in nodes:
        root.append(node)

    if not self._configuration['excludeEditorSignature']:
        return root
    return self._excludeEditorSignature(root)
</div>