Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# TODO: better way to check if populate_metadata_from_tree did
# anything useful?
if paper.title in [None, ""]:
log.debug("# TODO: parse metadata from html using plugins here")
else:
populated_metadata = True
# can't try anything else if the url is still bad
if paper.pdf_url in [None, ""]:
continue
# Normalize the two urls. The url from the metadata on the page
# might be different from the url that was originally passed in,
# even though both urls might still refer to the same resource.
if is_same_url(url, paper.pdf_url):
# pdf_url is same as original url, no pdf found yet. This
# happens when the pdf url is correct, but the publisher is
# returning html instead. And the html happens to reference the
# url that was originally requested in the first place. Argh.
continue
log.debug("Switching activity to pdf_url {}".format(paper.pdf_url))
# paper pdf is stored at a different url. Attempt to fetch that
# url now. Only do this if pdf_url != url because otherwise
# this will be an endless loop.
for (url3, response2) in iterdownload(paper.pdf_url, paper=paper):
if is_response_pdf(response2):
log.debug("Got pdf on second-level page.")
pdfcontent = remove_watermarks(response.content)
paper.pdf = pdfcontent