Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""
# NOTE(review): this fragment reads like the body of a test/helper whose
# `def` (and the docstring closed just above) lie outside this chunk;
# `body` and `get_node` must be defined there — confirm against the full file.
# Namespace prefixes bound for the xpath queries below: the OAI-PMH
# envelope and the MARC21 "slim" record format.
namespaces = [
("OAI-PMH", "http://www.openarchives.org/OAI/2.0/"),
("slim", "http://www.loc.gov/MARC21/slim"),
]
# Build a selector over the XML text with those namespaces registered.
node = get_node(text=body, namespaces=namespaces)
# First text node of any slim:datafield inside a slim:record.
record = node.xpath("//slim:record/slim:datafield/text()").extract_first()
assert node
assert record == "This is the record."
def build_item_rich(self, response):
    """Build the final HEPRecord with "rich" format XML.

    Populates a ``HEPLoader`` from the raw record that an earlier parse
    step stashed in ``response.meta``, plus journal metadata carried in
    the same ``meta`` dict.
    """
    # Selector over the raw "rich"-format XML record.
    node = get_node(response.meta["record"])
    # NOTE(review): article_type is unused in this span — presumably
    # consumed further down in the full method; confirm before removing.
    article_type = response.meta.get("article_type")
    record = HEPLoader(item=HEPRecord(), selector=node, response=response)
    record.add_dois(dois_values=response.meta.get("dois"))
    record.add_xpath('abstract', './/Abstract')
    record.add_xpath('title', './/ArticleTitle/Title')
    record.add_xpath('subtitle', './/ArticleTitle/Subtitle')
    record.add_value('authors', self._get_authors_rich(node))
    record.add_xpath('free_keywords', './/Subject/Keyword/text()')
    record.add_value('journal_title', response.meta['journal_title'])
    record.add_xpath('journal_issue', './/Issue/text()')
    record.add_xpath('journal_volume', './/Volume/text()')
    fpage = node.xpath('.//FirstPage/text()').extract_first()
    lpage = node.xpath('.//LastPage/text()').extract_first()
    record.add_value('journal_fpage', fpage)
    # BUG FIX: lpage was extracted above but never loaded into the record;
    # mirror the fpage line so the last page reaches the item.
    record.add_value('journal_lpage', lpage)
def get_root_node(arxiv_record):
    """Get a selector on the root ``article`` node of the record.

    This hook exists so subclasses can preprocess the XML before it is
    turned into a selector.

    Args:
        arxiv_record(Union[str, scrapy.selector.Selector]): the record in
            arXiv format, either raw XML text or an already-built selector.

    Returns:
        scrapy.selector.Selector: a selector on the root ``<article>``
            node, with XML namespaces stripped.
    """
    needs_parsing = isinstance(arxiv_record, six.string_types)
    root = get_node(arxiv_record) if needs_parsing else arxiv_record
    root.remove_namespaces()
    return root
# </article>  -- NOTE(review): stray closing tag (extraction residue), not Python code
def build_item(self, response):
    """Build the final record.

    Loads a ``HEPRecord`` from the MARC21-slim XML stashed in
    ``response.meta`` by an earlier parse step; datafields are addressed
    by MARC tag and subfield code.
    """
    selector = get_node(response.meta["record"], self.namespaces)
    loader = HEPLoader(item=HEPRecord(), selector=selector, response=response)
    loader.add_value('authors', self.get_authors(selector))
    # 245$a: title; 264$b/$c: publisher (source) and publication date.
    loader.add_xpath(
        'title',
        "./slim:datafield[@tag='245']/slim:subfield[@code='a']/text()",
    )
    loader.add_xpath(
        'source',
        "./slim:datafield[@tag='264']/slim:subfield[@code='b']/text()",
    )
    loader.add_xpath(
        'date_published',
        "./slim:datafield[@tag='264']/slim:subfield[@code='c']/text()",
    )
    loader.add_value(
        'thesis_supervisor',
        self.get_thesis_supervisors(selector),
    )
    # 041$a: language code.
    loader.add_xpath(
        'language',
        "./slim:datafield[@tag='041']/slim:subfield[@code='a']/text()",
    )
    # Remaining fields were collected earlier and travel in response.meta.
    loader.add_value('urls', response.meta.get('urls'))
    loader.add_value('file_urls', response.meta.get("direct_links"))
    loader.add_value('abstract', response.meta.get("abstract"))
# NOTE(review): indentation was lost in this chunk, so the scope of the
# `if` below is ambiguous — presumably only `related_article_doi` is
# conditional and the remaining add_* calls run unconditionally; confirm
# against the original file before reformatting.
def build_item_jats(self, response):
"""Build the final HEPRecord with JATS-format XML ('jp')."""
# Selector over the raw JATS record stashed in response.meta by the parser.
node = get_node(response.meta["record"])
article_type = response.meta.get("article_type")
record = HEPLoader(item=HEPRecord(), selector=node, response=response)
# Corrections and addenda carry a DOI pointer to the article they amend.
if article_type in ['correction',
'addendum']:
record.add_xpath('related_article_doi',
'.//related-article[@ext-link-type="doi"]/@href')
record.add_value('journal_doctype', article_type)
record.add_dois(dois_values=response.meta.get("dois"))
record.add_xpath('page_nr', ".//counts/page-count/@count")
# Only the first abstract element; JATS can carry several.
record.add_xpath('abstract', './/abstract[1]')
record.add_xpath('title', './/article-title/text()')
record.add_xpath('subtitle', './/subtitle/text()')
record.add_value('authors', self._get_authors_jats(node))
record.add_xpath('collaborations', ".//contrib/collab/text()")
def get_root_node(jats_record):
    """Get a selector on the root ``article`` node of the record.

    Subclasses may override this when the JATS XML needs preprocessing
    before it is parsed.

    Args:
        jats_record(Union[str, scrapy.selector.Selector]): the record in
            JATS format.

    Returns:
        scrapy.selector.Selector: a selector on the root ``<article>``
            node, namespaces removed.
    """
    root = jats_record
    if isinstance(root, six.string_types):
        # Raw XML text: parse it into a selector first.
        root = get_node(root)
    root.remove_namespaces()
    return root
# </article>  -- NOTE(review): stray closing tag (extraction residue), not Python code