Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def extract_html_header(self, doc):
    """Get metadata from the HTML head element.

    Looks up Open Graph and plain ``<meta>`` values through
    ``self.get_meta`` and hands each one to ``self.update`` under the
    matching field name. Comma-separated keyword lists are split, and
    every non-empty keyword is passed to ``self.result.emit_keyword``.

    :param doc: parsed document; must provide ``findtext`` and whatever
        ``self.get_meta`` requires of it.
    """
    self.update('title', self.get_meta(doc, 'og:title'))
    self.update('title', doc.findtext('.//title'))
    self.update('summary', self.get_meta(doc, 'og:description'))
    self.update('summary', self.get_meta(doc, 'description'))
    self.update('author', self.get_meta(doc, 'author'))
    self.update('author', self.get_meta(doc, 'og:site_name'))
    # The Open Graph article properties are spelled "article:*"; the
    # previous "artcile:*" spelling could never match a standard tag.
    self.update('published_at', self.get_meta(doc, 'article:published_time'))
    self.update('modified_at', self.get_meta(doc, 'article:modified_time'))
    for field in ('keywords', 'news_keywords'):
        content = self.get_meta(doc, field)
        if content is None:
            continue
        for keyword in content.split(','):
            keyword = collapse_spaces(keyword)
            # Truthiness skips empty keywords and also tolerates a None
            # result, where len() would raise.
            if keyword:
                self.result.emit_keyword(keyword)
def get_meta(self, doc, field):
    """Return the first non-empty ``content`` of a matching ``<meta>`` tag.

    Tags are matched on their ``property`` attribute first and on their
    ``name`` attribute second. The ``content`` value is passed through
    ``collapse_spaces`` before it is checked. Returns ``None`` when no
    tag yields a non-empty value.
    """
    for attr_name in ('property', 'name'):
        query = './/meta[@%s="%s"]' % (attr_name, field)
        for node in doc.findall(query):
            value = collapse_spaces(node.get('content'))
            if value is None or not len(value):
                continue
            return value
    return None
def clean_name(cls, text):
    """Normalise a name, or return ``None`` if it is unusable.

    ``None`` input and text longer than ``MAX_LENGTH`` are rejected up
    front. Otherwise the text goes through ``clean_entity_name`` and
    ``collapse_spaces``; a result that is empty or shorter than
    ``MIN_LENGTH`` is rejected as well.
    """
    if text is None:
        return
    if len(text) > MAX_LENGTH:
        return
    entity_name = clean_entity_name(text)
    cleaned = collapse_spaces(entity_name)
    if len(cleaned) == 0 or len(cleaned) < MIN_LENGTH:
        return
    return cleaned
def apply(self, record):
    """Render ``self.template`` with values taken from ``record``.

    Each key of ``self.replacements`` is a placeholder string and each
    value is the ``record`` key whose content replaces it. A missing or
    falsy record value is substituted as an empty string. The rendered
    text is run through ``collapse_spaces`` and stripped before it is
    returned.
    """
    rendered = six.text_type(self.template)
    for placeholder, key in self.replacements.items():
        substitute = six.text_type(record.get(key) or '')
        rendered = rendered.replace(placeholder, substitute)
    collapsed = collapse_spaces(rendered)
    return collapsed.strip()
def get_meta(self, doc, field):
    """Look up a ``<meta>`` value by ``property`` or ``name`` attribute.

    Searches ``doc`` for meta elements whose ``property`` (checked
    first) or ``name`` attribute equals ``field``. Returns the first
    ``content`` value that is still non-empty after ``collapse_spaces``.
    Falls through to an implicit ``None`` when nothing qualifies.
    """
    for attr in ('property', 'name'):
        matches = doc.findall('.//meta[@%s="%s"]' % (attr, field))
        for match in matches:
            text = collapse_spaces(match.get('content'))
            if text is not None and len(text) > 0:
                return text
def name(self, name):
    """Store ``name`` on ``self._name`` after normalising it.

    The value is converted with ``stringify``. If that yields ``None``
    it is stored as ``None``; otherwise it is stored after passing
    through ``collapse_spaces``.
    """
    cleaned = stringify(name)
    self._name = cleaned if cleaned is None else collapse_spaces(cleaned)
def element_text(el):
    """Return the space-collapsed text content of ``el``.

    Returns ``None`` when ``el`` is ``None`` or when ``stringify`` turns
    its ``text_content()`` into ``None``.
    """
    if el is not None:
        raw = stringify(el.text_content())
        if raw is not None:
            return collapse_spaces(raw)
    return None
def extract_html_text(self, doc):
    """Get all text from a DOM, also used by the XML parser.

    Joins the strings produced by ``self.extract_html_elements(doc)``
    with single spaces and passes the result through
    ``collapse_spaces``. Returns ``None`` when the collapsed text is
    empty.
    """
    joined = ' '.join(self.extract_html_elements(doc))
    collapsed = collapse_spaces(joined)
    return collapsed if len(collapsed) else None
def parse_for_metadata(context, data, html):
    """Fill ``data`` with values found in ``html`` via configured XPaths.

    ``context.params['meta']`` and ``context.params['meta_date']`` map a
    result key to one XPath or a list of XPaths. For each key, the first
    XPath that matches an element supplies the value: the element's text
    content passed through ``collapse_spaces``. Keys listed in
    ``meta_date`` are additionally passed through ``iso_date``. A value
    of ``None`` is not stored.

    :param context: object whose ``params`` mapping holds the config.
    :param data: dict that receives the extracted values (mutated).
    :param html: parsed document providing ``find``.
    :return: the same ``data`` dict.
    """
    meta = context.params.get('meta', {})
    meta_date = context.params.get('meta_date', {})
    # Merge into a fresh dict. Updating ``meta`` in place would write the
    # date paths back into ``context.params['meta']`` on every call.
    meta_paths = dict(meta)
    meta_paths.update(meta_date)

    for key, xpaths in meta_paths.items():
        for xpath in ensure_list(xpaths):
            element = html.find(xpath)
            if element is None:
                continue
            value = collapse_spaces(element.text_content())
            if key in meta_date:
                value = iso_date(value)
            if value is not None:
                data[key] = value
            # Stop at the first XPath that matched an element, even if
            # its value turned out to be unusable.
            break
    return data