How to use the `hepcrawl.utils.get_first` function in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Example from github.com/inspirehep/hepcrawl — hepcrawl/spiders/edp_spider.py (view on GitHub):
"""Get authors and return formatted dictionary.

        Note that the `get_authors` in JATS extractor doesn't work here.
        """
        authors = []
        for contrib in node.xpath('.//contrib[@contrib-type="author"]'):
            surname = contrib.xpath('name/surname/text()').extract()
            given_names = contrib.xpath('name/given-names/text()').extract()
            email = contrib.xpath('email/text()').extract_first()

            affs_raw = contrib.xpath('aff')
            affiliations = []
            reffered_id = contrib.xpath('xref[@ref-type="aff"]/@rid').extract()
            if reffered_id:
                aff = node.xpath('.//aff[@id="{0}"]/addr-line/institution/text()'.format(
                    get_first(reffered_id))).extract()
                if not aff:
                    aff = node.xpath('.//aff[@id="{0}"]/addr-line/text()'.format(
                        get_first(reffered_id))).extract()
                affs_raw += aff
            if affs_raw:
                affs_raw_no_email = []
                for aff_raw in affs_raw:
                    # Take e-mail from affiliation string
                    # FIXME: There is only one affiliation and email line per
                    # institution. The result is that every author will receive
                    # the email of the contact person as their own.
                    # There might also be a list of emails of all the authors.
                    if "e-mail" in aff_raw:
                        split_aff = aff_raw.split("e-mail")
                        affs_raw_no_email.append(split_aff[0].strip())
                        # FIXME: solution: strip the email but don't add it
Example from github.com/inspirehep/hepcrawl — hepcrawl/spiders/edp_spider.py (view on GitHub):
# institution. The result is that every author will receive
                    # the email of the contact person as their own.
                    # There might also be a list of emails of all the authors.
                    if "e-mail" in aff_raw:
                        split_aff = aff_raw.split("e-mail")
                        affs_raw_no_email.append(split_aff[0].strip())
                        # FIXME: solution: strip the email but don't add it
                        # to 'email' key?
                        # if not email:  # uncomment if you want to add it after all
                        #    email = [split_aff[1].strip(": \n")]
                if affs_raw_no_email:
                    affs_raw = affs_raw_no_email
                affiliations = [{'value': aff} for aff in affs_raw]
            authors.append({
                'surname': get_first(surname, ""),
                'given_names': get_first(given_names, ""),
                'affiliations': affiliations,
                'email': email,
            })

        return authors
Example from github.com/inspirehep/hepcrawl — hepcrawl/spiders/elsevier_spider.py (view on GitHub):
return raw_data.split()[-1].strip("';")
            else:
                return ''

        script = node.xpath(
            "//script[contains(text(), 'SDM.pm.coverDate')]").extract_first()
        if script:
            script = script.split("\n")
            raw_dois = [
                i for i in script if "SDM.doi" in i or "SDM.pm.doi" in i]
            dois = list(set([_strip_data(doi) for doi in raw_dois]))

            cover_date = [i for i in script if "SDM.pm.coverDate" in i]
            if cover_date:
                year = dparser.parse(_strip_data(
                    get_first(cover_date, ''))).year
                date_published = dparser.parse(
                    _strip_data(get_first(cover_date, ''))).date().isoformat()
        if not script:
            script = node.xpath(
                "//script[contains(text(), 'coverDate')]/text()").extract_first()
        if script:
            var_sdm = [sc for sc in script.split("var") if "SDM" in sc][0]
            cover_date_raw = [i for i in var_sdm.split(
                "\n") if "coverDate" in i]
            cover_date = cover_date_raw[0].split()[1].strip('",')
            date = dparser.parse(cover_date)
            date_published = date.date().isoformat()
            year = date.year

        if not dois:
            raw_dois = node.xpath(
Example from github.com/inspirehep/hepcrawl — hepcrawl/extractors/jats.py (view on GitHub):
def format_date(day, month, year):
            """Build an ISO-8601 date string from extracted day/month/year lists.

            Each argument is a list of extracted text values; the first element
            is used, and any missing component defaults to 1 (so a year-only
            date becomes YYYY-01-01).
            """
            d, m, y = (int(get_first(part, 1)) for part in (day, month, year))
            return datetime.date(year=y, month=m, day=d).isoformat()
Example from github.com/inspirehep/hepcrawl — hepcrawl/extractors/jats.py (view on GitHub):
reffered_id = contrib.xpath("xref[@ref-type='aff']/@rid").extract()
            if reffered_id:
                affiliations += node.xpath(".//aff[@id='{0}']".format(
                    get_first(reffered_id))
                )
            affiliations = [
                {'value': get_first(aff.re('(.*)'))}
                for aff in affiliations
                if aff.re('(.*)')
            ]

            authors.append({
                'surname': get_first(surname, ""),
                'given_names': get_first(given_names, ""),
                'affiliations': affiliations,
                'email': get_first(email, ""),
            })
        return authors
Example from github.com/inspirehep/hepcrawl — hepcrawl/spiders/elsevier_spider.py (view on GitHub):
for author in raw_authors:
            surname = author.xpath("./ce:surname/text()").extract_first()
            given_names = author.xpath(
                "./ce:given-name/text()").extract_first()
            if surname and given_names:
                fullname = u"{}, {}".format(surname, given_names)
                authors.append(fullname)
            elif surname:
                authors.append(surname)

        if len(authors) > 1:
            f_authors = ", ".join(authors[:-1])
            l_author = authors[-1]
            author_string = u"{} & {}".format(f_authors, l_author)
        else:
            author_string = get_first(authors)
        if ref.xpath(".//sb:et-al"):
            author_string += " et al."

        return author_string
Example from github.com/inspirehep/hepcrawl — hepcrawl/spiders/pos_spider.py (view on GitHub):
def _get_authors(selector):
        """Get article authors."""
        authors = []
        creators = selector.xpath('.//metadata/pex-dc/creator')
        for creator in creators:
            auth_dict = {}
            author = Selector(text=creator.extract())
            auth_dict['raw_name'] = get_first(
                author.xpath('.//name//text()').extract(),
                default='',
            )
            for affiliation in author.xpath(
                './/affiliation//text()'
            ).extract():
                if 'affiliations' in auth_dict:
                    auth_dict['affiliations'].append(
                        {
                            'value': affiliation
                        }
                    )
                else:
                    auth_dict['affiliations'] = [
                        {
                            'value': affiliation
Example from github.com/inspirehep/hepcrawl — hepcrawl/spiders/edp_spider.py (view on GitHub):
journal_year = node.xpath('.//IssueID/Year/text()').extract()
        if journal_year:
            record.add_value('journal_year', int(journal_year[0]))
        record.add_value('date_published', response.meta['date_published'])

        record.add_xpath('copyright_holder', './/Copyright/text()')
        record.add_value('collections', self._get_collections(
            node, article_type, response.meta['journal_title']))

        if "pdf_links" in response.meta:
            # NOTE: maybe this should be removed as the 'rich' format records
            # are not open access.
            record.add_value(
                "documents",
                self._create_file(
                    get_first(response.meta["pdf_links"]),
                    "INSPIRE-PUBLIC",
                    "Fulltext"
                )
            )
        record.add_value("urls", response.meta.get("urls"))

        parsed_item = ParsedItem(
            record=record.load_item(),
            record_format='hepcrawl',
        )

        return parsed_item
Example from github.com/inspirehep/hepcrawl — hepcrawl/extractors/jats.py (view on GitHub):
email = contrib.xpath("email/text()").extract()
            affiliations = contrib.xpath('aff')
            reffered_id = contrib.xpath("xref[@ref-type='aff']/@rid").extract()
            if reffered_id:
                affiliations += node.xpath(".//aff[@id='{0}']".format(
                    get_first(reffered_id))
                )
            affiliations = [
                {'value': get_first(aff.re('(.*)'))}
                for aff in affiliations
                if aff.re('(.*)')
            ]

            authors.append({
                'surname': get_first(surname, ""),
                'given_names': get_first(given_names, ""),
                'affiliations': affiliations,
                'email': get_first(email, ""),
            })
        return authors