Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_author_with_email():
    """Parse a JATS article that carries an author e-mail and expect a result."""
    crawler = edp_spider.EDPSpider()
    # The literal is kept flush-left so its content stays exactly as written.
    jats_xml = """
<article xmlns:xlink="http://www.w3.org/1999/xlink">
SnameFnameFname.Sname@university.orga
</article>
"""
    fake_resp = fake_response_from_string(jats_xml)
    article_nodes = get_node(crawler, "//article", fake_resp)
    first_article = article_nodes[0]
    result = crawler.parse_node(fake_resp, first_article)
    # Only truthiness of the parse result is checked here.
    assert result
def package_jats(targzfile):
    """Feed a local tar.gz JATS package to the spider and return the first result.

    :param targzfile: filesystem path of the package; it is turned into a
        ``file://`` URL for the fake response.
    :return: the first item yielded by ``EDPSpider.handle_package_file``.
    """
    # NOTE(review): presumably a pytest fixture - the decorator is not visible here.
    crawler = edp_spider.EDPSpider()
    package_url = "file://" + targzfile
    fake_resp = fake_response_from_string(text="", url=package_url)
    yielded = crawler.handle_package_file(fake_resp)
    return next(yielded)
def record_jats(package_jats, scrape_pos_page_body):
    """Run the EDP spider over a JATS package and return the parsed record.

    ``parse_node`` hands back a request here, so its callback is invoked by
    hand with an HTML response built from the supplied splash page body.

    :param package_jats: object whose ``url`` is a ``file://`` URL of the XML.
    :param scrape_pos_page_body: body used for the fake splash page response.
    :return: the ``record`` attribute of the item produced by the callback.
    """
    crawler = edp_spider.EDPSpider()
    local_xml = package_jats.url.replace("file://", "")
    xml_resp = fake_response_from_file(local_xml)
    article = get_node(crawler, "//article", xml_resp)[0]
    splash_request = crawler.parse_node(xml_resp, article)

    splash_resp = HtmlResponse(
        url=splash_request.url,
        request=splash_request,
        body=scrape_pos_page_body,
        encoding='utf-8'
    )

    item = splash_request.callback(splash_resp)
    assert item
    assert item.record
    return item.record
def test_no_dois_jats():
    """Run the JATS parser on an article that has no DOI."""
    crawler = edp_spider.EDPSpider()
    # The literal is kept flush-left so its content stays exactly as written.
    jats_xml = """
<article xmlns:xlink="http://www.w3.org/1999/xlink">
aa14485-102010A%26A...516A..97N
Dielectronic recombination of argon-like ions
</article>
"""
    fake_resp = fake_response_from_string(jats_xml)
    article = get_node(crawler, "//article", fake_resp)[0]
    # NOTE(review): the result is not asserted on in this block - the test
    # only verifies that parsing completes without raising.
    parsed_item = crawler.parse_node(fake_resp, article)
def test_addendum_jats():
    """Prepare a JATS article node for the addendum article-type case."""
    crawler = edp_spider.EDPSpider()
    # The literal is kept flush-left so its content stays exactly as written.
    jats_xml = """
<article xmlns:xlink="http://www.w3.org/1999/xlink">
aa14485-102010A%26A...516A..97N
Dielectronic recombination of argon-like ions
</article>
"""
    fake_resp = fake_response_from_string(jats_xml)
    matching_nodes = get_node(crawler, "//article", fake_resp)
    # NOTE(review): the node is extracted but never parsed or asserted on in
    # this block - confirm whether the rest of the test is missing.
    node = matching_nodes[0]
def test_tarfile(tarbzfile, tmpdir):
    """Untar a tar.bz package holding one test XML file, nested and flattened.

    The archive is extracted twice into ``tmpdir``: once with its directory
    layout intact and once with ``flatten=True``.
    """
    crawler = edp_spider.EDPSpider()
    nested_paths = crawler.untar_files(tarbzfile, six.text_type(tmpdir))
    flat_paths = crawler.untar_files(
        tarbzfile,
        six.text_type(tmpdir),
        flatten=True
    )

    # Exactly one XML file, still carrying its original sub-directories.
    assert len(nested_paths) == 1
    assert "aas/xml_rich/2000/01/ds1691.xml" in nested_paths[0]

    # Flattened extraction keeps the file name but drops the sub-directories.
    assert "ds1691.xml" in flat_paths[0]
    assert "aas/xml_rich/2000/01" not in flat_paths[0]
def record_rich(package_rich):
    """Run the EDP spider over a 'rich' format package and return the record.

    Unlike the JATS case, the item comes straight from ``parse_node``; no
    splash page request is followed.

    :param package_rich: object whose ``url`` is a ``file://`` URL of the XML.
    :return: the ``record`` attribute of the parsed item.
    """
    crawler = edp_spider.EDPSpider()
    local_xml = package_rich.url.replace("file://", "")
    xml_resp = fake_response_from_file(local_xml)
    # Flag the response as 'rich' format before handing it to the parser.
    xml_resp.meta["rich"] = True
    article = get_node(crawler, "//EDPSArticle", xml_resp)[0]
    item = crawler.parse_node(xml_resp, article)
    assert item
    assert item.record
    return item.record
def test_handle_package_ftp(tarbzfile):
    """Check the request yielded by ``handle_package_ftp`` and its source folder."""
    crawler = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(text=tarbzfile)
    yielded = crawler.handle_package_ftp(fake_resp)
    first_request = next(yielded)
    assert isinstance(first_request, Request)
    # The response text is expected back verbatim as the source folder.
    assert first_request.meta["source_folder"] == tarbzfile
def __init__(self, package_path=None, ftp_folder="incoming", ftp_netrc=None, *args, **kwargs):
    """Construct EDP spider.

    Remaining positional and keyword arguments are forwarded unchanged to
    the parent class constructor.

    :param package_path: path to local tar.gz or tar.bz2 package.
    :param ftp_folder: path on remote ftp server.
    :param ftp_netrc: path to netrc file.
    """
    super(EDPSpider, self).__init__(*args, **kwargs)
    self.ftp_folder = ftp_folder
    self.ftp_host = "ftp.edpsciences.org"
    self.ftp_netrc = ftp_netrc
    # mkdtemp() creates the directory itself before returning its path, so
    # the former "create it if it does not exist" branch could never run and
    # has been dropped.
    # NOTE(review): the parent directory is hard-coded to /tmp/ - confirm
    # this is intended on every platform the spider runs on.
    self.target_folder = mkdtemp(prefix='EDP_', dir='/tmp/')
    self.package_path = package_path