Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
base_url="",
hostname="",
url_schema=None,
ftp_params=None,
url=""
):
app = Flask('hepcrawl')
app.config.update(self.settings.getdict('MARC_TO_HEP_SETTINGS', {}))
file_name = url.split('/')[-1]
with app.app_context():
parsed_items = []
for xml_record in marcxml_records:
try:
record = marcxml2record(xml_record)
parsed_item = ParsedItem(record=record, record_format='hep')
parsed_item.ftp_params = ftp_params
parsed_item.file_name = file_name
files_to_download = [
self._get_full_uri(
current_url=document['url'],
base_url=base_url,
schema=url_schema,
hostname=hostname,
)
for document in parsed_item.record.get('documents', [])
if self._has_to_be_downloaded(document['url'])
]
parsed_item.file_urls = files_to_download
self.logger.info('Got the following attached documents to download: %s'% files_to_download)
record = JatsParser(node, source='WSP')
if record.article_type not in self.allowed_article_types:
# Filter out non-interesting article types
self.logger.info(
(
"Ignoring record because article type is not in %s, "
"record:\n%s"
),
self.allowed_article_types,
record,
)
return
self.logger.info('Parsing record:\n%s', record)
parsed_item = ParsedItem(
record=record.parse(),
record_format='hep',
)
return parsed_item
def build_item(self, response):
    """Build the final ``HEPRecord``.

    The values previously stored in ``response.meta`` (``node``,
    ``authors``, ``date``, ``title``, ``urls``, ``abstract`` and
    ``documents``) are fed into a ``HEPLoader``; the thesis degree type and
    the collections are fixed values.

    Args:
        response: response whose ``meta`` carries the scraped values.

    Yields:
        ParsedItem: a single item in ``hepcrawl`` format wrapping the
        loaded record.
    """
    meta = response.meta
    loader = HEPLoader(
        item=HEPRecord(),
        selector=meta.get("node"),
        response=response,
    )

    # (record field, value) pairs, listed in the order they must be added.
    field_values = (
        ('authors', meta.get("authors")),
        ('date_published', meta.get("date")),
        ('thesis', {'degree_type': "PhD"}),
        ('title', meta.get("title")),
        ('urls', meta.get("urls")),
        ("abstract", meta.get("abstract")),
        ("documents", meta.get("documents")),
        ('collections', ['HEP', 'THESIS']),
    )
    for field_name, field_value in field_values:
        loader.add_value(field_name, field_value)

    yield ParsedItem(
        record=loader.load_item(),
        record_format='hepcrawl',
    )
):
app = Flask('hepcrawl')
app.config.update(
settings.getdict('MARC_TO_HEP_SETTINGS', {})
)
with app.app_context():
try:
record = cds_marcxml2record(marcxml_record)
return ParsedItem(
record=record,
record_format='hep'
)
except Exception as e:
tb = ''.join(traceback.format_tb(sys.exc_info()[2]))
return ParsedItem.from_exception(
record_format='hep',
exception=repr(e),
traceback=tb,
source_data=marcxml_record
)
jsonrecord = response.meta.get('jsonrecord')
record = HEPLoader(
item=HEPRecord(), selector=jsonrecord, response=response)
record.add_value('title', jsonrecord.get('primary_title'))
record.add_value('abstract', jsonrecord.get('abstract'))
record.add_value('free_keywords', jsonrecord.get('keyword'))
record.add_value('page_nr', response.meta.get("pages"))
record.add_value('authors', response.meta.get("authors"))
record.add_value('file_urls', response.meta.get("pdf_link"))
record.add_value('urls', jsonrecord.get('uri'))
record.add_value('date_published', response.meta.get("date"))
record.add_value('thesis', response.meta.get("thesis"))
record.add_value('collections', ['HEP', 'THESIS'])
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)
return parsed_item
if "pdf_links" in response.meta:
record.add_value(
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
"Fulltext"
)
)
record.add_value("urls", response.meta.get("urls"))
references = self._get_references(node)
record.add_value("references", references)
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)
return parsed_item
def _parsed_item_from_marcxml(marcxml_record, settings):
    """Convert one MARCXML record into a ``ParsedItem`` in ``hep`` format.

    The conversion runs inside the application context of a throw-away
    Flask app configured from the ``MARC_TO_HEP_SETTINGS`` entry of
    ``settings``.

    Args:
        marcxml_record: the MARCXML record handed to
            ``cds_marcxml2record``.
        settings: object exposing ``getdict(name, default)``.

    Returns:
        ParsedItem: the converted record on success; otherwise an error
        item built with ``ParsedItem.from_exception`` that carries the
        exception ``repr``, the formatted traceback and the source record.
    """
    flask_app = Flask('hepcrawl')
    marc_to_hep_settings = settings.getdict('MARC_TO_HEP_SETTINGS', {})
    flask_app.config.update(marc_to_hep_settings)

    with flask_app.app_context():
        try:
            hep_record = cds_marcxml2record(marcxml_record)
            return ParsedItem(record=hep_record, record_format='hep')
        except Exception as error:
            # Any failure is reported as an error item instead of being
            # propagated to the caller.
            exc_traceback = sys.exc_info()[2]
            formatted_tb = ''.join(traceback.format_tb(exc_traceback))
            return ParsedItem.from_exception(
                record_format='hep',
                exception=repr(error),
                traceback=formatted_tb,
                source_data=marcxml_record
            )
node, article_type, response.meta['journal_title']))
if "pdf_links" in response.meta:
# NOTE: maybe this should be removed as the 'rich' format records
# are not open access.
record.add_value(
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
"Fulltext"
)
)
record.add_value("urls", response.meta.get("urls"))
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)
return parsed_item
def build_item(self, response):
    """Build the final HEPRecord.

    Values are taken from ``response.meta``: ``node`` is used as the loader
    selector, and ``authors``, ``date``, ``title``, ``urls``, ``abstract``
    and ``files`` populate the record. The thesis degree type and the
    collections are fixed values.

    Args:
        response: response whose ``meta`` carries the scraped values.

    Yields:
        ParsedItem: a single item in ``hepcrawl`` format wrapping the
        loaded record.
    """
    meta = response.meta
    selector_node = meta.get("node")
    loader = HEPLoader(
        item=HEPRecord(),
        selector=selector_node,
        response=response,
    )
    add = loader.add_value

    add('authors', meta.get("authors"))
    add('date_published', meta.get("date"))
    add('thesis', {'degree_type': "PhD"})
    add('title', meta.get("title"))
    add('urls', meta.get("urls"))
    add("abstract", meta.get("abstract"))
    # The attached documents are stored under the "files" meta key.
    add("documents", meta.get("files"))
    add('collections', ['HEP', 'THESIS'])

    loaded_record = loader.load_item()
    yield ParsedItem(record=loaded_record, record_format='hepcrawl')
def parse(self, response):
    """Parse a JSON article entry.

    Args:
        response: response exposing ``status`` and a JSON ``body``.

    Returns:
        ParsedItem: the record produced by ``CrossrefParser`` in ``hep``
        format. On any failure (including a 404 status) an error item built
        with ``ParsedItem.from_exception`` is returned instead, carrying
        the exception ``repr``, the formatted traceback, the response body
        and ``self.url`` as the file name.
    """
    try:
        if response.status == 404:
            # Raised inside the ``try`` on purpose: the handler below turns
            # it into an error item.
            raise ValueError("DOI not found on Crossref")

        article = json.loads(response.body)
        crossref_parser = CrossrefParser(article)
        hep_record = crossref_parser.parse()
        return ParsedItem(record=hep_record, record_format='hep')
    except Exception as error:
        exc_traceback = sys.exc_info()[2]
        formatted_tb = ''.join(traceback.format_tb(exc_traceback))
        return ParsedItem.from_exception(
            record_format='hep',
            exception=repr(error),
            traceback=formatted_tb,
            source_data=response.body,
            file_name=self.url
        )