Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""Get authors and return formatted dictionary.
Note that the `get_authors` in JATS extractor doesn't work here.
"""
authors = []
for contrib in node.xpath('.//contrib[@contrib-type="author"]'):
surname = contrib.xpath('name/surname/text()').extract()
given_names = contrib.xpath('name/given-names/text()').extract()
email = contrib.xpath('email/text()').extract_first()
affs_raw = contrib.xpath('aff')
affiliations = []
reffered_id = contrib.xpath('xref[@ref-type="aff"]/@rid').extract()
if reffered_id:
aff = node.xpath('.//aff[@id="{0}"]/addr-line/institution/text()'.format(
get_first(reffered_id))).extract()
if not aff:
aff = node.xpath('.//aff[@id="{0}"]/addr-line/text()'.format(
get_first(reffered_id))).extract()
affs_raw += aff
if affs_raw:
affs_raw_no_email = []
for aff_raw in affs_raw:
# Take e-mail from affiliation string
# FIXME: There is only one affiliation and email line per
# institution. The result is that every author will receive
# the email of the contact person as their own.
# There might also be a list of emails of all the authors.
if "e-mail" in aff_raw:
split_aff = aff_raw.split("e-mail")
affs_raw_no_email.append(split_aff[0].strip())
# FIXME: solution: strip the email but don't add it
# institution. The result is that every author will receive
# the email of the contact person as their own.
# There might also be a list of emails of all the authors.
if "e-mail" in aff_raw:
split_aff = aff_raw.split("e-mail")
affs_raw_no_email.append(split_aff[0].strip())
# FIXME: solution: strip the email but don't add it
# to 'email' key?
# if not email: # uncomment if you want to add it after all
# email = [split_aff[1].strip(": \n")]
if affs_raw_no_email:
affs_raw = affs_raw_no_email
affiliations = [{'value': aff} for aff in affs_raw]
authors.append({
'surname': get_first(surname, ""),
'given_names': get_first(given_names, ""),
'affiliations': affiliations,
'email': email,
})
return authors
# NOTE(review): orphaned tail of a helper — presumably `_strip_data`, which is
# called below in the DOI/cover-date parsing; the `def` line and the opening
# `if` condition are missing from this chunk. Pulls the last whitespace-
# separated token and strips surrounding quote/semicolon characters.
return raw_data.split()[-1].strip("';")
else:
return ''
# Locate the inline <script> that carries SDM metadata (DOIs, cover date).
# NOTE(review): fragment — indentation stripped; free names (`node`,
# `dparser`, `get_first`, `_strip_data`) are bound elsewhere in the file.
script = node.xpath(
"//script[contains(text(), 'SDM.pm.coverDate')]").extract_first()
if script:
script = script.split("\n")
# Lines assigning the DOI, e.g. "SDM.doi = '10.1016/...';".
raw_dois = [
i for i in script if "SDM.doi" in i or "SDM.pm.doi" in i]
# Deduplicate; `_strip_data` extracts the quoted value from the JS line.
dois = list(set([_strip_data(doi) for doi in raw_dois]))
cover_date = [i for i in script if "SDM.pm.coverDate" in i]
if cover_date:
year = dparser.parse(_strip_data(
get_first(cover_date, ''))).year
date_published = dparser.parse(
_strip_data(get_first(cover_date, ''))).date().isoformat()
# Fallback: some pages expose the date under a plain 'coverDate' key.
if not script:
script = node.xpath(
"//script[contains(text(), 'coverDate')]/text()").extract_first()
if script:
# Keep only the "var ... SDM ..." statement, then its coverDate line.
var_sdm = [sc for sc in script.split("var") if "SDM" in sc][0]
cover_date_raw = [i for i in var_sdm.split(
"\n") if "coverDate" in i]
cover_date = cover_date_raw[0].split()[1].strip('",')
date = dparser.parse(cover_date)
date_published = date.date().isoformat()
year = date.year
# NOTE(review): `dois` is only bound inside the first `if script:` branch
# above, so this test can raise NameError when that branch was skipped —
# confirm against the full file.
if not dois:
# NOTE(review): chunk is cut mid-statement here (xpath argument missing).
raw_dois = node.xpath(
def format_date(day, month, year):
    """Build an ISO-8601 date string from extracted date components.

    Each argument is a raw extractor result (list of strings); any missing
    component defaults to 1 via `get_first` before integer conversion.
    """
    day_num, month_num, year_num = (
        int(get_first(component, 1)) for component in (day, month, year)
    )
    return datetime.date(year_num, month_num, day_num).isoformat()
reffered_id = contrib.xpath("xref[@ref-type='aff']/@rid").extract()
if reffered_id:
affiliations += node.xpath(".//aff[@id='{0}']".format(
get_first(reffered_id))
)
affiliations = [
{'value': get_first(aff.re('(.*)'))}
for aff in affiliations
if aff.re('(.*)')
]
authors.append({
'surname': get_first(surname, ""),
'given_names': get_first(given_names, ""),
'affiliations': affiliations,
'email': get_first(email, ""),
})
return authors
# NOTE(review): fragment starts mid-function — `raw_authors`, `authors` and
# `ref` are bound above this chunk; indentation has been stripped.
for author in raw_authors:
surname = author.xpath("./ce:surname/text()").extract_first()
given_names = author.xpath(
"./ce:given-name/text()").extract_first()
# Prefer "Surname, Given"; fall back to surname alone.
if surname and given_names:
fullname = u"{}, {}".format(surname, given_names)
authors.append(fullname)
elif surname:
authors.append(surname)
# Join as "A, B & C"; a single author is used as-is.
if len(authors) > 1:
f_authors = ", ".join(authors[:-1])
l_author = authors[-1]
author_string = u"{} & {}".format(f_authors, l_author)
else:
author_string = get_first(authors)
# <sb:et-al> marks a truncated author list in the reference.
if ref.xpath(".//sb:et-al"):
author_string += " et al."
return author_string
def _get_authors(selector):
"""Get article authors."""
# NOTE(review): indentation has been stripped from this chunk; code is
# left byte-identical.
authors = []
# One <creator> element per author in the pex-dc metadata.
creators = selector.xpath('.//metadata/pex-dc/creator')
for creator in creators:
auth_dict = {}
# Re-parse the creator snippet so relative xpaths run on it alone.
author = Selector(text=creator.extract())
auth_dict['raw_name'] = get_first(
author.xpath('.//name//text()').extract(),
default='',
)
# Collect every affiliation text node under this creator.
for affiliation in author.xpath(
'.//affiliation//text()'
).extract():
if 'affiliations' in auth_dict:
auth_dict['affiliations'].append(
{
'value': affiliation
}
)
else:
auth_dict['affiliations'] = [
{
# NOTE(review): chunk is cut mid-literal here — the closing
# braces/brackets and the rest of the function are missing.
'value': affiliation
# NOTE(review): fragment starts mid-method — `node`, `record`, `response`,
# `article_type` and `self` are bound above this chunk; indentation has
# been stripped.
journal_year = node.xpath('.//IssueID/Year/text()').extract()
if journal_year:
record.add_value('journal_year', int(journal_year[0]))
record.add_value('date_published', response.meta['date_published'])
record.add_xpath('copyright_holder', './/Copyright/text()')
record.add_value('collections', self._get_collections(
node, article_type, response.meta['journal_title']))
if "pdf_links" in response.meta:
# NOTE: maybe this should be removed as the 'rich' format records
# are not open access.
record.add_value(
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
"Fulltext"
)
)
record.add_value("urls", response.meta.get("urls"))
# Wrap the loaded item in the ParsedItem envelope used downstream.
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)
return parsed_item
email = contrib.xpath("email/text()").extract()
affiliations = contrib.xpath('aff')
reffered_id = contrib.xpath("xref[@ref-type='aff']/@rid").extract()
if reffered_id:
affiliations += node.xpath(".//aff[@id='{0}']".format(
get_first(reffered_id))
)
affiliations = [
{'value': get_first(aff.re('(.*)'))}
for aff in affiliations
if aff.re('(.*)')
]
authors.append({
'surname': get_first(surname, ""),
'given_names': get_first(given_names, ""),
'affiliations': affiliations,
'email': get_first(email, ""),
})
return authors