Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
conference_paper_pdf_url,
):
selector = Selector(
text=xml_record,
type="xml"
)
selector.remove_namespaces()
record = HEPLoader(
item=HEPRecord(),
selector=selector
)
license_text = selector.xpath(
'.//metadata/pex-dc/rights/text()'
).extract_first()
record.add_value('license', get_licenses(license_text=license_text))
date, year = self._get_date(selector=selector)
record.add_value('date_published', date)
record.add_value('journal_year', year)
identifier = selector.xpath(
".//metadata/pex-dc/identifier/text()"
).extract_first()
record.add_value(
'journal_title',
self._get_journal_title(pos_ext_identifier=identifier),
)
record.add_value(
'journal_volume',
self._get_journal_volume(pos_ext_identifier=identifier),
)
if doctype in {'correction', 'addendum'}:
# NOTE: should test if this is working as intended.
record.add_xpath(
'related_article_doi', "//related-article[@ext-link-type='doi']/@href")
xml_file = response.meta.get("xml_url")
if xml_file:
record.add_value(
'documents',
self.add_file(xml_file, "HIDDEN", "Fulltext"),
)
sd_url = self._get_sd_url(xml_file)
if requests.head(sd_url).status_code == 200: # Test if valid url
record.add_value("urls", sd_url)
license = get_licenses(
license_url=node.xpath(
".//oa:userLicense/text()"
).extract_first(),
)
record.add_value('license', license)
record.add_value('abstract', self.get_abstract(node))
record.add_value('title', self.get_title(node))
record.add_value('authors', self.get_authors(node))
# record.add_xpath("urls", "//prism:url/text()") # We don't want dx.doi urls
record.add_value('free_keywords', self.get_keywords(node))
info = response.meta.get("info")
if info:
record.add_value('date_published', info.get("date_published"))
record.add_value('journal_title', info.get("journal_title"))
record.add_value('journal_issue', info.get("issue"))
).extract()
if journal_year:
record.add_value('journal_year', int(journal_year[0]))
record.add_xpath('journal_issue',
"./datafield[@tag='773']/subfield[@code='n']/text()")
fpage, lpage = self.get_journal_pages(node)
record.add_value('journal_fpage', fpage)
record.add_value('journal_lpage', lpage)
cr_statement, cr_year = self.get_copyright(node)
record.add_value('copyright_statement', cr_statement)
record.add_value('copyright_year', cr_year)
license = get_licenses(
license_url=node.xpath(
"./datafield[@tag='540']/subfield[@code='u']/text()"
).extract_first(),
license_text=node.xpath(
"./datafield[@tag='540']/subfield[@code='a']/text()"
).extract_first(),
)
record.add_value('license', license)
pdf_links, xml_links, splash_links = self.get_urls_in_record(node)
record.add_value('urls', splash_links)
record.add_value('file_urls', pdf_links)
if xml_links:
record.add_value(
'documents',
[
fpage = node.xpath('.//front//fpage/text()').extract()
lpage = node.xpath('.//front//lpage/text()').extract()
record.add_value('journal_fpage', fpage)
record.add_value('journal_lpage', lpage)
date_published = response.meta['date_published']
record.add_value('journal_year', int(date_published[:4]))
record.add_value('date_published', date_published)
record.add_xpath('copyright_holder', './/copyright-holder/text()')
record.add_xpath('copyright_year', './/copyright-year/text()')
record.add_xpath('copyright_statement',
'.//copyright-statement/text()')
record.add_value('copyright_material', 'Article')
license = get_licenses(
license_url=node.xpath(
'.//license/license-p/ext-link/@href'
).extract_first()
)
record.add_value('license', license)
record.add_value('collections', self._get_collections(
node, article_type, response.meta['journal_title']))
if "pdf_links" in response.meta:
record.add_value(
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
"Fulltext"
record.add_value('journal_volume',
get_value(article, 'volume.number', default=''))
# record.add_value('journal_artid', )
published_date = article.get('date', '')
record.add_value('journal_year', int(published_date[:4]))
record.add_value('date_published', published_date)
record.add_value('copyright_holder',
get_value(article, 'rights.copyrightHolders.name[0]', default=''))
record.add_value('copyright_year',
str(get_value(article, 'rights.copyrightYear', default='')))
record.add_value('copyright_statement',
get_value(article, 'rights.rightsStatement', default=''))
record.add_value('copyright_material', 'publication')
license = get_licenses(
license_url=get_value(article, 'rights.licenses.url[0]', default='')
)
record.add_value('license', license)
record.add_value('collections', ['HEP', 'Citeable', 'Published'])
return ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)