How to use the hepcrawl.utils.get_licenses function in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples based on popular ways the get_licenses function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: inspirehep/hepcrawl, file hepcrawl/spiders/pos_spider.py (view on GitHub)
conference_paper_pdf_url,
    ):
        selector = Selector(
            text=xml_record,
            type="xml"
        )
        selector.remove_namespaces()
        record = HEPLoader(
            item=HEPRecord(),
            selector=selector
        )

        license_text = selector.xpath(
            './/metadata/pex-dc/rights/text()'
        ).extract_first()
        record.add_value('license', get_licenses(license_text=license_text))

        date, year = self._get_date(selector=selector)
        record.add_value('date_published', date)
        record.add_value('journal_year', year)

        identifier = selector.xpath(
            ".//metadata/pex-dc/identifier/text()"
        ).extract_first()
        record.add_value(
            'journal_title',
            self._get_journal_title(pos_ext_identifier=identifier),
        )
        record.add_value(
            'journal_volume',
            self._get_journal_volume(pos_ext_identifier=identifier),
        )
Source: inspirehep/hepcrawl, file hepcrawl/spiders/elsevier_spider.py (view on GitHub)
if doctype in {'correction', 'addendum'}:
            # NOTE: should test if this is working as intended.
            record.add_xpath(
                'related_article_doi', "//related-article[@ext-link-type='doi']/@href")

        xml_file = response.meta.get("xml_url")
        if xml_file:
            record.add_value(
                'documents',
                self.add_file(xml_file, "HIDDEN", "Fulltext"),
            )
            sd_url = self._get_sd_url(xml_file)
            if requests.head(sd_url).status_code == 200:  # Test if valid url
                record.add_value("urls", sd_url)

        license = get_licenses(
            license_url=node.xpath(
                ".//oa:userLicense/text()"
            ).extract_first(),
        )
        record.add_value('license', license)

        record.add_value('abstract', self.get_abstract(node))
        record.add_value('title', self.get_title(node))
        record.add_value('authors', self.get_authors(node))
        # record.add_xpath("urls", "//prism:url/text()")  # We don't want dx.doi urls
        record.add_value('free_keywords', self.get_keywords(node))
        info = response.meta.get("info")
        if info:
            record.add_value('date_published', info.get("date_published"))
            record.add_value('journal_title', info.get("journal_title"))
            record.add_value('journal_issue', info.get("issue"))
Source: inspirehep/hepcrawl, file hepcrawl/spiders/hindawi_spider.py (view on GitHub)
).extract()
        if journal_year:
            record.add_value('journal_year', int(journal_year[0]))

        record.add_xpath('journal_issue',
                         "./datafield[@tag='773']/subfield[@code='n']/text()")

        fpage, lpage = self.get_journal_pages(node)
        record.add_value('journal_fpage', fpage)
        record.add_value('journal_lpage', lpage)

        cr_statement, cr_year = self.get_copyright(node)
        record.add_value('copyright_statement', cr_statement)
        record.add_value('copyright_year', cr_year)

        license = get_licenses(
            license_url=node.xpath(
                "./datafield[@tag='540']/subfield[@code='u']/text()"
            ).extract_first(),
            license_text=node.xpath(
                "./datafield[@tag='540']/subfield[@code='a']/text()"
            ).extract_first(),
        )
        record.add_value('license', license)

        pdf_links, xml_links, splash_links = self.get_urls_in_record(node)
        record.add_value('urls', splash_links)
        record.add_value('file_urls', pdf_links)
        if xml_links:
            record.add_value(
                'documents',
                [
Source: inspirehep/hepcrawl, file hepcrawl/spiders/edp_spider.py (view on GitHub)
fpage = node.xpath('.//front//fpage/text()').extract()
        lpage = node.xpath('.//front//lpage/text()').extract()
        record.add_value('journal_fpage', fpage)
        record.add_value('journal_lpage', lpage)

        date_published = response.meta['date_published']
        record.add_value('journal_year', int(date_published[:4]))
        record.add_value('date_published', date_published)

        record.add_xpath('copyright_holder', './/copyright-holder/text()')
        record.add_xpath('copyright_year', './/copyright-year/text()')
        record.add_xpath('copyright_statement',
                         './/copyright-statement/text()')
        record.add_value('copyright_material', 'Article')

        license = get_licenses(
            license_url=node.xpath(
                './/license/license-p/ext-link/@href'
            ).extract_first()
        )
        record.add_value('license', license)

        record.add_value('collections', self._get_collections(
            node, article_type, response.meta['journal_title']))

        if "pdf_links" in response.meta:
            record.add_value(
                "documents",
                self._create_file(
                    get_first(response.meta["pdf_links"]),
                    "INSPIRE-PUBLIC",
                    "Fulltext"
Source: inspirehep/hepcrawl, file hepcrawl/spiders/aps_spider.py (view on GitHub)
record.add_value('journal_volume',
                         get_value(article, 'volume.number', default=''))
        # record.add_value('journal_artid', )

        published_date = article.get('date', '')
        record.add_value('journal_year', int(published_date[:4]))
        record.add_value('date_published', published_date)
        record.add_value('copyright_holder',
                         get_value(article, 'rights.copyrightHolders.name[0]', default=''))
        record.add_value('copyright_year',
                         str(get_value(article, 'rights.copyrightYear', default='')))
        record.add_value('copyright_statement',
                         get_value(article, 'rights.rightsStatement', default=''))
        record.add_value('copyright_material', 'publication')

        license = get_licenses(
            license_url=get_value(article, 'rights.licenses.url[0]', default='')
        )
        record.add_value('license', license)

        record.add_value('collections', ['HEP', 'Citeable', 'Published'])

        return ParsedItem(
            record=record.load_item(),
            record_format='hepcrawl',
        )