Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def parse_node(self, response, node):
    """Populate a ``HEPRecord`` loader from one XML record node.

    Namespaces are stripped from ``node`` first so that the plain
    ``datafield``/``subfield`` XPath expressions below match.

    Args:
        response: the response the node was taken from; handed to the loader.
        node: selector for a single record element.
    """
    node.remove_namespaces()
    loader = HEPLoader(item=HEPRecord(), selector=node, response=response)
    loader.add_value('authors', self.get_authors(node))

    # Fields that map one-to-one onto a datafield/subfield text lookup.
    text_fields = (
        ('abstract', "./datafield[@tag='520']/subfield[@code='a']/text()"),
        ('title', "./datafield[@tag='245']/subfield[@code='a']/text()"),
        ('date_published', "./datafield[@tag='260']/subfield[@code='c']/text()"),
        ('page_nr', "./datafield[@tag='300']/subfield[@code='a']/text()"),
    )
    for field_name, field_xpath in text_fields:
        loader.add_xpath(field_name, field_xpath)

    # Only 024 datafields whose subfield '2' mentions 'DOI' contribute DOIs.
    doi_xpath = (
        "./datafield[@tag='024'][subfield[@code='2'][contains(text(), 'DOI')]]"
        "/subfield[@code='a']/text()"
    )
    loader.add_dois(dois_values=node.xpath(doi_xpath).extract())

    loader.add_xpath(
        'journal_title',
        "./datafield[@tag='773']/subfield[@code='p']/text()",
    )
def build_item(self, response):
"""Build the final record from data carried in ``response.meta``.

The ``jsonrecord`` entry of ``response.meta`` is used both as the loader's
selector and as the source of the title, abstract, keywords and URI. The
``pages``, ``authors``, ``pdf_link``, ``date`` and ``thesis`` values are
read from ``response.meta`` directly.
"""
jsonrecord = response.meta.get('jsonrecord')
record = HEPLoader(
item=HEPRecord(), selector=jsonrecord, response=response)
# NOTE(review): ``jsonrecord`` is None when the meta key is missing, in which
# case the ``jsonrecord.get`` calls below fail -- presumably the caller always
# sets it; confirm.
record.add_value('title', jsonrecord.get('primary_title'))
record.add_value('abstract', jsonrecord.get('abstract'))
record.add_value('free_keywords', jsonrecord.get('keyword'))
record.add_value('page_nr', response.meta.get("pages"))
record.add_value('authors', response.meta.get("authors"))
record.add_value('file_urls', response.meta.get("pdf_link"))
record.add_value('urls', jsonrecord.get('uri'))
record.add_value('date_published', response.meta.get("date"))
record.add_value('thesis', response.meta.get("thesis"))
# Every record built here is filed under both the HEP and THESIS collections.
record.add_value('collections', ['HEP', 'THESIS'])
# Wrap the loaded item, tagging it with the 'hepcrawl' record format.
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
# NOTE(review): the visible excerpt ends inside this call; any remaining
# arguments, the closing parenthesis and the return are not shown here.
def build_conference_proceedings_item(
self,
proceedings_page_html,
pos_id,
):
"""Build a proceedings record from the HTML of a proceedings page.

Args:
    proceedings_page_html: HTML text of the page; parsed with ``Selector``
        using ``type='html'``.
    pos_id: not used in the visible part of this method -- presumably it
        feeds the ``journal_volume`` value; TODO confirm.
"""
selector = Selector(
text=proceedings_page_html,
type='html',
)
selector.remove_namespaces()
# No ``response`` is passed to the loader here; only the parsed selector.
record = HEPLoader(
item=HEPRecord(),
selector=selector
)
record.add_value('collections', ['proceedings'])
# Title and subtitle are both extracted from the page by private helpers.
record.add_value(
'title',
self._get_proceedings_title(selector=selector),
)
record.add_value(
'subtitle',
self._get_proceedings_date_place(selector=selector),
)
# The journal title is the fixed string 'PoS' for every proceedings record.
record.add_value('journal_title', 'PoS')
record.add_value(
'journal_volume',
# NOTE(review): the visible excerpt ends inside this call; the volume value
# and the rest of the method are not shown here.
def _parse_json_on_failure(self, failure):
    """Populate a record loader from the JSON article attached to a failed request.

    Both the original response and the JSON article are read from
    ``failure.request.meta`` (keys ``original_response`` and
    ``json_article``).

    Args:
        failure: failure object whose ``request.meta`` carries the data above.
    """
    loader = HEPLoader(
        item=HEPRecord(),
        response=failure.request.meta['original_response'],
    )
    json_article = failure.request.meta['json_article']

    loader.add_dois(
        dois_values=[get_value(json_article, 'identifiers.doi', default='')],
    )

    # A page count is recorded only when it is present and positive.
    num_pages = json_article.get('numPages', -1)
    if num_pages > 0:
        loader.add_value('page_nr', str(num_pages))

    abstract_text = get_value(json_article, 'abstract.value', default='')
    loader.add_value('abstract', abstract_text)
    title_text = get_value(json_article, 'title.value', default='')
    loader.add_value('title', title_text)

    author_list, collaboration_list = self._get_authors_and_collab(json_article)
    loader.add_value('authors', author_list)
    loader.add_value('collaborations', collaboration_list)
    # 'subtitle' and 'free_keywords' are intentionally left unpopulated here.
def build_item(self, response):
"""Scrape MIT full metadata page and build the final HEPRecord item.

Values are looked up in the page's metadata table: each ``td`` whose text
contains a ``dc.*`` key is followed by the ``td`` holding its value.

Returns:
    None when a degree description is present and does not contain "ph"
    (case-insensitive). The normal return value is not visible in this
    excerpt.
"""
node = response.selector
record = HEPLoader(item=HEPRecord(), selector=node, response=response)
doc_type = node.xpath(
"//td[contains(text(), 'dc.description.degree')]/following-sibling::td[1]/text()").extract_first()
# Skip the record when a degree is given but lacks "ph"; a missing degree
# field does not cause a skip.
if doc_type and "ph" not in doc_type.lower():
return None
# Links found in the table with id 'file-table' are attached as documents.
pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
if pdf_files:
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', self.get_authors(node))
record.add_xpath('date_published',
"//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()")
record.add_value('thesis', self.get_thesis_info(node))
record.add_value('thesis_supervisor',
# NOTE(review): the visible excerpt ends inside this call; the supervisor
# value and the rest of the method are not shown here.
def build_item(self, response):
    """Populate the HEPRecord loader from values stored in ``response.meta``.

    PDF links, when present, are attached as hidden full-text documents.
    The remaining metadata is copied from ``response.meta``, and the
    source and collections are fixed values.

    Args:
        response: response whose ``meta`` carries the scraped values.
    """
    selector = response.selector
    loader = HEPLoader(item=HEPRecord(), selector=selector, response=response)
    meta = response.meta

    fulltext_links = meta.get("pdf_links")
    if fulltext_links:
        documents = self.add_file(fulltext_links, "HIDDEN", "Fulltext")
        loader.add_value('documents', documents)

    # (record field, key in response.meta), added in this order.
    meta_fields = (
        ('authors', "authors"),
        ('date_published', "date_published"),
        ('thesis', "thesis_info"),
        ('thesis_supervisor', "supervisors"),
        ('title', "titles"),
        ('urls', "splash_link"),
        ('abstract', "abstract"),
    )
    for field_name, meta_key in meta_fields:
        loader.add_value(field_name, meta.get(meta_key))

    loader.add_value('source', 'INFN')
    loader.add_value('collections', ['HEP', 'THESIS'])
def build_item(self, response):
    """Build the final record from the JSON record in ``response.meta``.

    The ``title``, ``abstract``, ``doi``, ``pages``, ``authors`` and
    ``links`` keys are read with plain indexing and are therefore required.
    ``year``, ``pub_type`` and ``pubInfo`` are optional.

    Args:
        response: response whose ``meta`` holds ``jsonrecord`` and,
            optionally, ``direct_links``.

    Raises:
        KeyError: if one of the required keys is missing from the record.
    """
    jsonrecord = response.meta.get('jsonrecord')
    record = HEPLoader(
        item=HEPRecord(), selector=jsonrecord, response=response)

    record.add_value('title', jsonrecord['title'])
    record.add_value('abstract', jsonrecord['abstract'])
    record.add_dois(dois_values=jsonrecord['doi'])
    record.add_value('page_nr', jsonrecord['pages'])
    record.add_value('authors', self.get_authors(jsonrecord['authors']))
    record.add_value('file_urls', response.meta.get("direct_links"))
    record.add_value('urls', jsonrecord['links'])
    record.add_value('source', "Philpapers.org")

    # Forthcoming publications have no usable publication date yet.
    if jsonrecord.get('year') != "forthcoming":
        record.add_value('date_published', self.get_date(jsonrecord))

    # ``.get`` returns None for a missing (or null) key; fall back to an
    # empty string so the substring checks cannot raise AttributeError.
    pub_type = (jsonrecord.get('pub_type') or '').lower()
    pub_info = (jsonrecord.get('pubInfo') or '').lower()
    if "thesis" in pub_type or "dissertation" in pub_info:
        record.add_value('collections', ['THESIS'])
def build_item(self, response):
"""Build the final record from the XML record stored in ``response.meta``.

The record node is obtained with ``get_node`` using ``self.namespaces``;
the XPath expressions below rely on the ``dc`` and ``base_dc`` prefixes.
Direct link and URLs are read from ``response.meta``.
"""
node = get_node(response.meta["record"], self.namespaces)
record = HEPLoader(item=HEPRecord(), selector=node, response=response)
record.add_value('file_urls', response.meta.get("direct_link"))
record.add_value('urls', response.meta.get("urls"))
record.add_xpath('abstract', './/dc:description/text()')
# Title and subtitle are each added only when ``get_title`` returns a
# truthy value for them.
title, subtitle = self.get_title(node)
if title:
record.add_value('title', title)
if subtitle:
record.add_value('subtitle', subtitle)
record.add_xpath('date_published', './/dc:date/text()')
record.add_xpath('source', './/base_dc:collname/text()')
record.add_value("authors", self.get_authors(node))
# The degree type is hard-coded to 'PhD' for every record built here.
record.add_value('thesis', {'degree_type': 'PhD'})
record.add_value('collections', ['HEP', 'THESIS'])
parsed_item = ParsedItem(
record=record.load_item(),
# NOTE(review): the visible excerpt ends inside this call; any remaining
# arguments, the closing parenthesis and the return are not shown here.
def build_item(self, response):
"""Build the final record from the XML record stored in ``response.meta``.

The record node is obtained with ``get_node`` using ``self.namespaces``;
the XPath expressions below rely on the ``slim`` prefix and select
``datafield``/``subfield`` elements by tag and code. URLs, direct links,
abstract and page number are read from ``response.meta``.
"""
node = get_node(response.meta["record"], self.namespaces)
record = HEPLoader(item=HEPRecord(), selector=node, response=response)
record.add_value('authors', self.get_authors(node))
record.add_xpath('title',
"./slim:datafield[@tag='245']/slim:subfield[@code='a']/text()")
# Source and publication date both come from datafield 264 (codes b and c).
record.add_xpath('source',
"./slim:datafield[@tag='264']/slim:subfield[@code='b']/text()")
record.add_xpath('date_published',
"./slim:datafield[@tag='264']/slim:subfield[@code='c']/text()")
record.add_value('thesis_supervisor',
self.get_thesis_supervisors(node))
record.add_xpath(
'language', "./slim:datafield[@tag='041']/slim:subfield[@code='a']/text()")
# The remaining values were collected earlier and passed along in the meta.
record.add_value('urls', response.meta.get('urls'))
record.add_value('file_urls', response.meta.get("direct_links"))
record.add_value('abstract', response.meta.get("abstract"))
record.add_value('page_nr', response.meta.get("page_nr"))
# NOTE(review): the visible excerpt ends here; loading and returning the
# item are not shown.