Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def extract_by_title(self, element) -> str:
    """Return the leading segment of the document's ``<title>`` text.

    The first text node found by ``//title/text()`` is split on
    ``TITLE_SPLIT_CHAR_PATTERN`` and the piece before the first separator
    is returned.

    :param element: object exposing an ``xpath`` method (the parsed page)
    :return: the first segment of the title, or ``''`` when the page has
             no ``<title>`` text
    """
    title_texts = element.xpath('//title/text()')
    if not title_texts:
        return ''
    # re.split always returns a list with at least one item (the whole
    # string when nothing matches), so indexing [0] is safe and no
    # emptiness check is needed. Only the first piece is used, hence
    # maxsplit=1.
    return re.split(TITLE_SPLIT_CHAR_PATTERN, title_texts[0], maxsplit=1)[0]
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
    """Extract the news title, trying several strategies in order.

    The strategies are tried in this order and the first truthy result wins:
    the XPath (argument, or the ``title.xpath`` config entry), heading/title
    matching, the ``<title>`` tag, and finally heading tags alone.

    :param element: parsed page
    :param title_xpath: optional XPath that overrides the configured one
    :return: the selected title with surrounding whitespace stripped
    """
    xpath_expr = title_xpath or config.get('title', {}).get('xpath')

    candidate = self.extract_by_xpath(element, xpath_expr)
    if not candidate:
        candidate = self.extract_by_htag_and_title(element)
    if not candidate:
        candidate = self.extract_by_title(element)
    if not candidate:
        # Last resort: its result is used even when it is falsy.
        candidate = self.extract_by_htag(element)
    return candidate.strip()
def extractor(self, element: HtmlElement, author_xpath=''):
    """Extract the author of a news page.

    When an XPath is available (argument, or the ``author.xpath`` config
    entry) its results are joined and returned as-is. Otherwise all text
    under ``element`` is joined and searched with each pattern in
    ``self.author_pattern``; group 1 of the first match is returned.

    :param element: parsed page
    :param author_xpath: optional XPath that overrides the configured one
    :return: the author string, or ``''`` when no pattern matches
    """
    xpath_expr = author_xpath or config.get('author', {}).get('xpath')
    if xpath_expr:
        return ''.join(element.xpath(xpath_expr))

    full_text = ''.join(element.xpath('.//text()'))
    # Lazy generator: the search stops at the first pattern that matches.
    matches = (re.search(pattern, full_text) for pattern in self.author_pattern)
    return next((found.group(1) for found in matches if found), '')
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
    """Extract the publish time, trying three sources in priority order.

    :param element: parsed page
    :param publish_time_xpath: optional XPath that overrides the
        ``publish_time.xpath`` config entry
    :return: the first truthy result, or whatever the text-based
             extraction returns when the earlier sources yield nothing
    """
    xpath_expr = publish_time_xpath or config.get('publish_time', {}).get('xpath')

    # First priority: the XPath specified by the user.
    found = self.extract_from_user_xpath(xpath_expr, element)
    if found:
        return found

    # Second priority: extract from the META data.
    found = self.extract_from_meta(element)
    if found:
        return found

    # Worst case: extract from the body text.
    return self.extract_from_text(element)
# Title, publish time and author each come from their own extractor. These
# three calls run on `element` before pre_parse() and remove_noise_node()
# are applied below.
title = TitleExtractor().extract(element, title_xpath=title_xpath)
publish_time = TimeExtractor().extractor(element, publish_time_xpath=publish_time_xpath)
author = AuthorExtractor().extractor(element, author_xpath=author_xpath)
# `element` is rebound to whatever pre_parse() returns; remove_noise_node()
# is then called with it and noise_node_list, and its return value is unused.
element = pre_parse(element)
remove_noise_node(element, noise_node_list)
content = ContentExtractor().extract(element,
host=host,
with_body_html=with_body_html,
body_xpath=body_xpath)
# Only the first entry of `content` is used; its item at index 1 is read as a
# mapping with 'text' and 'images' keys (and 'body_html' further down).
# NOTE(review): content[0] would fail if `content` is empty — confirm that
# ContentExtractor.extract never returns an empty sequence.
result = {'title': title,
'author': author,
'publish_time': publish_time,
'content': content[0][1]['text'],
'images': content[0][1]['images']
}
# 'body_html' is added when either the argument or the config flag is truthy.
if with_body_html or config.get('with_body_html', False):
result['body_html'] = content[0][1]['body_html']
return result
GNE 成为全球最好的新闻提取模块-今日头条
新华网:GNE 成为全球最好的新闻提取模块
同时,新闻的某个 <h> 标签中也会包含这个新闻标题。
因此,通过 h 标签与 title 的文字双向匹配,找到最适合作为新闻标题的字符串。
但是,需要考虑到 title 与 h 标签中的文字可能均含有特殊符号,因此,不能直接通过
判断 h 标签中的文字是否在 title 中来判断,这里需要使用最长公共子串。
:param element:
:return:
"""
# Collect every text node found under any h1-h5 in the document, and the
# concatenated text of the <title> element(s).
h_tag_texts_list = element.xpath('(//h1//text() | //h2//text() | //h3//text() | //h4//text() | //h5//text())')
title_text = ''.join(element.xpath('//title/text()'))
# Keep the longest value returned by get_longest_common_sub_string() across
# all heading text nodes (presumably the longest common substring of the
# title and that node — confirm against the helper). The strict '>' means the
# earliest candidate wins on ties, and '' is returned when no candidate is
# non-empty.
news_title = ''
for h_tag_text in h_tag_texts_list:
lcs = get_longest_common_sub_string(title_text, h_tag_text)
if len(lcs) > len(news_title):
news_title = lcs
return news_title
import json
import glob
from gne import GeneralNewsExtractor
if __name__ == '__main__':
    # Run the extractor over every HTML fixture under tests/ and print the
    # result of each one as pretty-printed JSON.
    for html_file in glob.glob('tests/**/*.html', recursive=True):
        with open(html_file, encoding='utf-8') as source:
            page_source = source.read()

        # XPaths handed to the extractor as noise nodes. The list is rebuilt
        # on every iteration so each call receives its own fresh list.
        noise_xpaths = [
            '//div[@class="comment-list"]',
            '//*[@style="display:none"]',
            '//div[@class="statement"]',
        ]
        news = GeneralNewsExtractor().extract(
            page_source,
            host='https://www.xxx.com',
            noise_node_list=noise_xpaths,
        )

        print(f'>>>>>>>>>>>>>{html_file}>>>>>>>>>>>>>')
        print(json.dumps(news, indent=2, ensure_ascii=False))
        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def __init__(self):
    """Store ``AUTHOR_PATTERN`` on the instance as ``author_pattern``."""
    self.author_pattern = AUTHOR_PATTERN
def __init__(self):
    """Store ``DATETIME_PATTERN`` on the instance as ``time_pattern``."""
    self.time_pattern = DATETIME_PATTERN
def extract_from_meta(self, element: HtmlElement) -> str:
    """
    Some well-structured news sites put the publish time in META tags,
    so the META data should be checked first.

    Each XPath in ``PUBLISH_TIME_META`` is evaluated in order; the first
    non-empty result is joined into one string and returned.

    :param element: DOM tree built from the page source
    :return: str, ``''`` when no XPath yields a result
    """
    # Lazy generator: later XPaths are not evaluated once one has matched.
    results = (element.xpath(expr) for expr in PUBLISH_TIME_META)
    return next((''.join(hit) for hit in results if hit), '')