Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_parsed_node(parsed_node):
    """Check the result of ``parse_node`` for a page with a direct link.

    The ``parsed_node`` argument must be the final ``HEPRecord``, and the
    ``value`` of its first ``urls`` entry must be the expected PDF address.
    """
    expected_pdf_url = (
        "http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf"
    )

    assert isinstance(parsed_node, hepcrawl.items.HEPRecord)

    first_url = parsed_node["urls"][0]
    assert first_url["value"] == expected_pdf_url
def build_conference_paper_item(
self,
xml_record,
conference_paper_url,
conference_paper_pdf_url,
):
# Fill a HEPLoader for one conference paper from the XML text in `xml_record`.
# NOTE(review): this snippet has lost its indentation and is cut off inside
# the final `record.add_value(` call; the rest of the method (including any
# use of `conference_paper_url` / `conference_paper_pdf_url`) is not visible.

# Parse the XML text; namespaces are removed before the un-prefixed XPath
# queries below are run (presumably so that those plain tag names match).
selector = Selector(
text=xml_record,
type="xml"
)
selector.remove_namespaces()
record = HEPLoader(
item=HEPRecord(),
selector=selector
)
# License: first text node of <rights>, passed through `get_licenses`.
license_text = selector.xpath(
'.//metadata/pex-dc/rights/text()'
).extract_first()
record.add_value('license', get_licenses(license_text=license_text))
# `_get_date` returns a pair that is unpacked into publication date and year.
date, year = self._get_date(selector=selector)
record.add_value('date_published', date)
record.add_value('journal_year', year)
# First text node of <identifier>; how it is used lies in the truncated part.
identifier = selector.xpath(
".//metadata/pex-dc/identifier/text()"
).extract_first()
record.add_value(
def build_item_jats(self, response):
"""Build the final HEPRecord with JATS-format XML ('jp')."""
# NOTE(review): indentation was lost in this snippet and the method appears
# to continue past what is shown: nothing is loaded or returned here, and
# `classification_numbers` is not used in the visible part.

# The record and the article type are both read from `response.meta`;
# "record" is required (KeyError if absent), "article_type" is optional.
node = get_node(response.meta["record"])
article_type = response.meta.get("article_type")
record = HEPLoader(item=HEPRecord(), selector=node, response=response)
# For corrections and addenda, also collect the DOI of the related article.
if article_type in ['correction',
'addendum']:
record.add_xpath('related_article_doi',
'.//related-article[@ext-link-type="doi"]/@href')
# NOTE(review): with indentation lost it is unclear whether the next line
# belongs to the `if` above or runs unconditionally -- confirm.
record.add_value('journal_doctype', article_type)
# DOIs come from the request meta rather than from the XML node.
record.add_dois(dois_values=response.meta.get("dois"))
record.add_xpath('page_nr', ".//counts/page-count/@count")
# Only the first <abstract> element is selected.
record.add_xpath('abstract', './/abstract[1]')
record.add_xpath('title', './/article-title/text()')
record.add_xpath('subtitle', './/subtitle/text()')
record.add_value('authors', self._get_authors_jats(node))
record.add_xpath('collaborations', ".//contrib/collab/text()")
# `_get_keywords` returns a pair: free keywords and classification numbers.
free_keywords, classification_numbers = self._get_keywords(node)
record.add_value('free_keywords', free_keywords)
def build_item(self, response):
"""Parse an Elsevier XML file into a HEP record."""
# NOTE(review): indentation was lost in this snippet and the method appears
# to continue past what is shown (no item is loaded, returned or yielded).
node = response.meta.get("node")
record = HEPLoader(
item=HEPRecord(), selector=node, response=response)
doctype = self.get_doctype(node)
self.logger.info("Doc type is %s", doctype)
# For corrections and addenda, collect the DOI of the related article.
if doctype in {'correction', 'addendum'}:
# NOTE: should test if this is working as intended.
# NOTE(review): this XPath starts with `//`, i.e. it is not written as a
# query relative to `node` (that would be `.//`) -- confirm which is meant.
record.add_xpath(
'related_article_doi', "//related-article[@ext-link-type='doi']/@href")
# When the request meta carries an "xml_url", register that file as a
# document with the arguments "HIDDEN" and "Fulltext".
xml_file = response.meta.get("xml_url")
if xml_file:
record.add_value(
'documents',
self.add_file(xml_file, "HIDDEN", "Fulltext"),
)
# URL derived from the XML file location by `_get_sd_url`; it is only
# recorded when a HEAD request to it answers with status 200.
# NOTE(review): `requests.head` is called without a `timeout`, so this
# synchronous call can wait indefinitely on an unresponsive server --
# consider passing one.
sd_url = self._get_sd_url(xml_file)
if requests.head(sd_url).status_code == 200: # Test if valid url
record.add_value("urls", sd_url)
def build_item(self, response):
    """Assemble the final thesis ``HEPRecord`` from ``response.meta``.

    Every field except the constant thesis degree type and collections is
    read from the request meta. Yields a single ``ParsedItem`` wrapping the
    loaded record, with ``record_format`` set to ``'hepcrawl'``.
    """
    meta = response.meta
    selector = meta.get("node")
    loader = HEPLoader(
        item=HEPRecord(),
        selector=selector,
        response=response,
    )

    # Values handed over by the earlier parsing steps, plus two constants.
    loader.add_value('authors', meta.get("authors"))
    loader.add_value('date_published', meta.get("date"))
    loader.add_value('thesis', {'degree_type': "PhD"})
    loader.add_value('title', meta.get("title"))
    loader.add_value('urls', meta.get("urls"))
    loader.add_value("abstract", meta.get("abstract"))
    loader.add_value("documents", meta.get("files"))
    loader.add_value('collections', ['HEP', 'THESIS'])

    yield ParsedItem(
        record=loader.load_item(),
        record_format='hepcrawl',
    )
def build_item(self, response):
"""Build the final record."""
jsonrecord = response.meta.get('jsonrecord')
record = HEPLoader(
item=HEPRecord(), selector=jsonrecord, response=response)
record.add_value('title', jsonrecord['title'])
record.add_value('abstract', jsonrecord['abstract'])
record.add_dois(dois_values=jsonrecord['doi'])
record.add_value('page_nr', jsonrecord['pages'])
record.add_value('authors', self.get_authors(jsonrecord['authors']))
record.add_value('file_urls', response.meta.get("direct_links"))
record.add_value('urls', jsonrecord['links'])
record.add_value('source', "Philpapers.org")
if not jsonrecord.get('year') == "forthcoming":
record.add_value('date_published', self.get_date(jsonrecord))
type_thesis = "thesis" in jsonrecord.get('pub_type').lower()
info_diss = "dissertation" in jsonrecord.get('pubInfo').lower()
if type_thesis or info_diss:
record.add_value('collections', ['THESIS'])
elif "journal" in jsonrecord.get('pub_type').lower():