How to use the hepcrawl.utils.ParsedItem class in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / hepcrawl / hepcrawl / spiders / desy_spider.py View on Github external
base_url="",
            hostname="",
            url_schema=None,
            ftp_params=None,
            url=""
    ):
        app = Flask('hepcrawl')
        app.config.update(self.settings.getdict('MARC_TO_HEP_SETTINGS', {}))
        file_name = url.split('/')[-1]

        with app.app_context():
            parsed_items = []
            for xml_record in marcxml_records:
                try:
                    record = marcxml2record(xml_record)
                    parsed_item = ParsedItem(record=record, record_format='hep')
                    parsed_item.ftp_params = ftp_params
                    parsed_item.file_name = file_name

                    files_to_download = [
                        self._get_full_uri(
                            current_url=document['url'],
                            base_url=base_url,
                            schema=url_schema,
                            hostname=hostname,
                        )
                        for document in parsed_item.record.get('documents', [])
                        if self._has_to_be_downloaded(document['url'])
                    ]
                    parsed_item.file_urls = files_to_download

                    self.logger.info('Got the following attached documents to download: %s'% files_to_download)
github inspirehep / hepcrawl / hepcrawl / spiders / wsp_spider.py View on Github external
record = JatsParser(node, source='WSP')
        if record.article_type not in self.allowed_article_types:
            # Filter out non-interesting article types
            self.logger.info(
                (
                    "Ignoring record because article type is not in %s, "
                    "record:\n%s"
                ),
                self.allowed_article_types,
                record,
            )
            return

        self.logger.info('Parsing record:\n%s', record)
        parsed_item = ParsedItem(
            record=record.parse(),
            record_format='hep',
        )

        return parsed_item
github inspirehep / hepcrawl / hepcrawl / spiders / t2k_spider.py View on Github external
def build_item(self, response):
        """Assemble the final ``HEPRecord`` and emit it as a ``ParsedItem``.

        Most field values are read from ``response.meta``; the thesis
        degree type and the collections are fixed constants.

        Args:
            response: the response whose ``meta`` mapping carries the
                scraped values (``node``, ``authors``, ``date``, ``title``,
                ``urls``, ``abstract``, ``documents``).

        Yields:
            ParsedItem: a single item wrapping the loaded record, tagged
            with ``record_format='hepcrawl'``.
        """
        meta = response.meta
        selector_node = meta.get("node")
        loader = HEPLoader(
            item=HEPRecord(),
            selector=selector_node,
            response=response,
        )

        # (record field, value) pairs, in the order they are added.
        field_values = (
            ('authors', meta.get("authors")),
            ('date_published', meta.get("date")),
            ('thesis', {'degree_type': "PhD"}),
            ('title', meta.get("title")),
            ('urls', meta.get("urls")),
            ("abstract", meta.get("abstract")),
            ("documents", meta.get("documents")),
            ('collections', ['HEP', 'THESIS']),
        )
        for field_name, field_value in field_values:
            loader.add_value(field_name, field_value)

        yield ParsedItem(
            record=loader.load_item(),
            record_format='hepcrawl',
        )
github inspirehep / hepcrawl / hepcrawl / spiders / cds_spider.py View on Github external
):
    app = Flask('hepcrawl')
    app.config.update(
        settings.getdict('MARC_TO_HEP_SETTINGS', {})
    )

    with app.app_context():
        try:
            record = cds_marcxml2record(marcxml_record) 
            return ParsedItem(
                record=record,
                record_format='hep'
            )
        except Exception as e:
            tb = ''.join(traceback.format_tb(sys.exc_info()[2]))
            return ParsedItem.from_exception(
                record_format='hep',
                exception=repr(e),
                traceback=tb,
                source_data=marcxml_record
            )
github inspirehep / hepcrawl / hepcrawl / spiders / brown_spider.py View on Github external
jsonrecord = response.meta.get('jsonrecord')
        record = HEPLoader(
            item=HEPRecord(), selector=jsonrecord, response=response)

        record.add_value('title', jsonrecord.get('primary_title'))
        record.add_value('abstract', jsonrecord.get('abstract'))
        record.add_value('free_keywords', jsonrecord.get('keyword'))
        record.add_value('page_nr', response.meta.get("pages"))
        record.add_value('authors', response.meta.get("authors"))
        record.add_value('file_urls', response.meta.get("pdf_link"))
        record.add_value('urls', jsonrecord.get('uri'))
        record.add_value('date_published', response.meta.get("date"))
        record.add_value('thesis', response.meta.get("thesis"))
        record.add_value('collections', ['HEP', 'THESIS'])

        parsed_item = ParsedItem(
            record=record.load_item(),
            record_format='hepcrawl',
        )

        return parsed_item
github inspirehep / hepcrawl / hepcrawl / spiders / edp_spider.py View on Github external
if "pdf_links" in response.meta:
            record.add_value(
                "documents",
                self._create_file(
                    get_first(response.meta["pdf_links"]),
                    "INSPIRE-PUBLIC",
                    "Fulltext"
                )
            )
        record.add_value("urls", response.meta.get("urls"))

        references = self._get_references(node)
        record.add_value("references", references)

        parsed_item = ParsedItem(
            record=record.load_item(),
            record_format='hepcrawl',
        )

        return parsed_item
github inspirehep / hepcrawl / hepcrawl / spiders / cds_spider.py View on Github external
def _parsed_item_from_marcxml(
        marcxml_record,
        settings
):
    """Convert one MARCXML record into a ``ParsedItem``.

    A Flask app named ``hepcrawl`` is created and configured from the
    ``MARC_TO_HEP_SETTINGS`` entry of ``settings``; the conversion runs
    inside that app's application context.

    Args:
        marcxml_record: the MARCXML record handed to ``cds_marcxml2record``.
        settings: object exposing ``getdict(name, default)``.

    Returns:
        ParsedItem: the converted record with ``record_format='hep'``, or,
        if anything raises during conversion, an error item built through
        ``ParsedItem.from_exception`` carrying the exception repr, the
        formatted traceback and the original ``marcxml_record``.
    """
    flask_app = Flask('hepcrawl')
    marc_to_hep_config = settings.getdict('MARC_TO_HEP_SETTINGS', {})
    flask_app.config.update(marc_to_hep_config)

    with flask_app.app_context():
        try:
            hep_record = cds_marcxml2record(marcxml_record)
            return ParsedItem(record=hep_record, record_format='hep')
        except Exception as error:
            # Errors are reported as items rather than propagated.
            _, _, tb_object = sys.exc_info()
            formatted_tb = ''.join(traceback.format_tb(tb_object))
            return ParsedItem.from_exception(
                record_format='hep',
                exception=repr(error),
                traceback=formatted_tb,
                source_data=marcxml_record,
            )
github inspirehep / hepcrawl / hepcrawl / spiders / edp_spider.py View on Github external
node, article_type, response.meta['journal_title']))

        if "pdf_links" in response.meta:
            # NOTE: maybe this should be removed as the 'rich' format records
            # are not open access.
            record.add_value(
                "documents",
                self._create_file(
                    get_first(response.meta["pdf_links"]),
                    "INSPIRE-PUBLIC",
                    "Fulltext"
                )
            )
        record.add_value("urls", response.meta.get("urls"))

        parsed_item = ParsedItem(
            record=record.load_item(),
            record_format='hepcrawl',
        )

        return parsed_item
github inspirehep / hepcrawl / hepcrawl / spiders / magic_spider.py View on Github external
def build_item(self, response):
        """Assemble the final ``HEPRecord`` and emit it as a ``ParsedItem``.

        Most field values are read from ``response.meta``; the thesis
        degree type and the collections are fixed constants. The
        ``documents`` field is filled from the ``files`` meta entry.

        Args:
            response: the response whose ``meta`` mapping carries the
                scraped values (``node``, ``authors``, ``date``, ``title``,
                ``urls``, ``abstract``, ``files``).

        Yields:
            ParsedItem: a single item wrapping the loaded record, tagged
            with ``record_format='hepcrawl'``.
        """
        meta = response.meta
        selector_node = meta.get("node")
        loader = HEPLoader(
            item=HEPRecord(),
            selector=selector_node,
            response=response,
        )

        # (record field, value) pairs, in the order they are added.
        field_values = (
            ('authors', meta.get("authors")),
            ('date_published', meta.get("date")),
            ('thesis', {'degree_type': "PhD"}),
            ('title', meta.get("title")),
            ('urls', meta.get("urls")),
            ("abstract", meta.get("abstract")),
            ("documents", meta.get("files")),
            ('collections', ['HEP', 'THESIS']),
        )
        for field_name, field_value in field_values:
            loader.add_value(field_name, field_value)

        yield ParsedItem(
            record=loader.load_item(),
            record_format='hepcrawl',
        )
github inspirehep / hepcrawl / hepcrawl / spiders / crossref_spider.py View on Github external
def parse(self, response):
        """Parse a JSON article entry into a ``ParsedItem``.

        Args:
            response: the response to parse; its ``status`` is checked
                and its ``body`` is decoded as JSON.

        Returns:
            ParsedItem: the parsed record with ``record_format='hep'``.
            If the status is 404, or anything else raises while parsing,
            an error item built through ``ParsedItem.from_exception`` is
            returned instead, carrying the exception repr, the formatted
            traceback, the raw response body and ``self.url`` as the
            file name.
        """
        try:
            if response.status == 404:
                raise ValueError("DOI not found on Crossref")

            payload = json.loads(response.body)
            crossref_parser = CrossrefParser(payload)
            hep_record = crossref_parser.parse()
            return ParsedItem(record=hep_record, record_format='hep')
        except Exception as error:
            # Failures (including the 404 above) are reported as items
            # rather than propagated.
            _, _, tb_object = sys.exc_info()
            formatted_tb = ''.join(traceback.format_tb(tb_object))
            return ParsedItem.from_exception(
                record_format='hep',
                exception=repr(error),
                traceback=formatted_tb,
                source_data=response.body,
                file_name=self.url,
            )