Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
    """Return the page title with surrounding whitespace removed.

    Strategies are tried in priority order and the first truthy result wins:
    ``extract_by_xpath`` (using ``title_xpath``, or the ``xpath`` entry under
    ``config['title']`` when the argument is empty), then
    ``extract_by_htag_and_title``, ``extract_by_title`` and finally
    ``extract_by_htag``.

    Args:
        element: Parsed HTML tree that is handed to every strategy.
        title_xpath: Optional XPath locating the title. When empty, the value
            configured under ``config['title']['xpath']`` is used instead.

    Returns:
        The stripped title, or ``''`` when no strategy produced one.
    """
    title_xpath = title_xpath or config.get('title', {}).get('xpath')
    title = (self.extract_by_xpath(element, title_xpath)
             or self.extract_by_htag_and_title(element)
             or self.extract_by_title(element)
             or self.extract_by_htag(element))
    # When every strategy is falsy the chain evaluates to the last strategy's
    # return value, which may be None rather than ''. Guard so the declared
    # ``str`` return type holds instead of raising AttributeError on .strip().
    return title.strip() if title else ''
def extractor(self, element: HtmlElement, author_xpath: str = '') -> str:
    """Return the author of the page, or ``''`` when none can be found.

    The XPath (``author_xpath``, or the ``xpath`` entry under
    ``config['author']`` when the argument is empty) has first priority.
    If it is absent or matches nothing, the concatenated text of ``element``
    is searched with each regex in ``self.author_pattern`` in order, and
    capture group 1 of the first match is returned.

    Args:
        element: Parsed HTML tree; only its ``xpath`` method is used here.
        author_xpath: Optional XPath whose results are joined into the author
            string. It is expected to select strings, since the results are
            passed straight to ``''.join``.

    Returns:
        The author string, or ``''`` if neither the XPath nor any pattern
        yields a result.
    """
    author_xpath = author_xpath or config.get('author', {}).get('xpath')
    if author_xpath:
        author = ''.join(element.xpath(author_xpath))
        # Only trust the XPath when it actually matched something; otherwise
        # fall through to the pattern search, mirroring how the title and
        # publish-time extractors fall back to their next strategy.
        if author:
            return author
    text = ''.join(element.xpath('.//text()'))
    for pattern in self.author_pattern:
        author_obj = re.search(pattern, text)
        if author_obj:
            return author_obj.group(1)
    return ''
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
    """Return the publish time of the page using a prioritised fallback.

    Sources are consulted in this order and the first truthy value wins:

    1. the user-specified XPath (the argument, or the ``xpath`` entry under
       ``config['publish_time']`` when the argument is empty);
    2. the page's meta information, via ``extract_from_meta``;
    3. as a last resort, the body text, via ``extract_from_text``.

    If none is truthy, whatever ``extract_from_text`` returned is passed back.
    """
    publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')

    # First priority: the XPath supplied by the user.
    found = self.extract_from_user_xpath(publish_time_xpath, element)
    if not found:
        # Second priority: extract from the meta tags.
        found = self.extract_from_meta(element)
    if not found:
        # Worst case: extract from the body text.
        found = self.extract_from_text(element)
    return found
# NOTE(review): this is the body of a function whose `def` line is not shown here;
# element, title_xpath, publish_time_xpath, author_xpath, noise_node_list, host,
# with_body_html and body_xpath are presumably its parameters or locals - confirm.
# Title, publish time and author are extracted from `element` as received,
# i.e. before pre_parse and remove_noise_node are applied below.
title = TitleExtractor().extract(element, title_xpath=title_xpath)
publish_time = TimeExtractor().extractor(element, publish_time_xpath=publish_time_xpath)
author = AuthorExtractor().extractor(element, author_xpath=author_xpath)
# `element` is rebound to the result of pre_parse. The return value of
# remove_noise_node is ignored, so it presumably edits the tree in place - TODO confirm.
element = pre_parse(element)
remove_noise_node(element, noise_node_list)
content = ContentExtractor().extract(element,
host=host,
with_body_html=with_body_html,
body_xpath=body_xpath)
# Only content[0][1] is read: presumably `content` is a ranked sequence of
# (key, info) pairs with the best candidate first - verify against ContentExtractor.
# NOTE(review): an empty `content` would raise IndexError here.
result = {'title': title,
'author': author,
'publish_time': publish_time,
'content': content[0][1]['text'],
'images': content[0][1]['images']
}
# body_html is included when either the caller's flag or the config flag is truthy.
if with_body_html or config.get('with_body_html', False):
result['body_html'] = content[0][1]['body_html']
return result