Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_ocr_timeout(renderer, resources, outpdf):
    """Run with ``--tesseract-timeout 0`` and expect no text on page 0.

    ``skew.pdf`` is processed through ``check_ocrmypdf`` using the PDF
    renderer named by ``renderer``; the output is then inspected with
    ``PdfInfo``.
    """
    source = resources / 'skew.pdf'
    cli_args = ['--tesseract-timeout', '0', '--pdf-renderer', renderer]
    result = check_ocrmypdf(source, outpdf, *cli_args)

    first_page = PdfInfo(result)[0]
    assert not first_page.has_text
def test_redo_ocr(resources, outpdf):
    """Check that ``--redo-ocr`` keeps text on page 0 but changes its text areas.

    ``graph_ocred.pdf`` is analysed before and after the run with
    ``detailed_page_analysis=True`` so the text areas can be compared.
    """
    source = resources / 'graph_ocred.pdf'
    info_before = PdfInfo(source, detailed_page_analysis=True)

    result = check_ocrmypdf(source, outpdf, '--redo-ocr')
    info_after = PdfInfo(result, detailed_page_analysis=True)

    # Both the input and the output must carry text on the first page.
    assert info_before[0].has_text
    assert info_after[0].has_text

    areas_before = info_before[0].get_textareas()
    areas_after = info_after[0].get_textareas()
    assert areas_before != areas_after, "Expected text to be different after re-OCR"
def test_jpeg(resources, outdir):
    """Inspect the first image on page 0 of ``c02-22.pdf``.

    The image must be reported as JPEG-encoded with a horizontal
    resolution close to 150. ``outdir`` is accepted but not referenced in
    the body.
    """
    info = pdfinfo.PdfInfo(resources / 'c02-22.pdf')
    first_image = info[0].images[0]

    assert first_image.enc == Encoding.jpeg
    assert isclose(first_image.xres, 150)
def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
    """Check that ``--oversample 350`` yields a page resolution of about 350.

    ``skew.pdf`` is processed with ``--oversample 350``, ``-f`` and the PDF
    renderer named by ``renderer``, using ``spoof_tesseract_cache`` as the
    environment. The horizontal resolution reported by ``PdfInfo`` for
    page 0 must be within 1 of 350.
    """
    oversampled_pdf = check_ocrmypdf(
        resources / 'skew.pdf',
        outpdf,
        '--oversample',
        '350',
        '-f',
        '--pdf-renderer',
        renderer,
        env=spoof_tesseract_cache,
    )

    xres = PdfInfo(oversampled_pdf)[0].xres
    # Report the measured value in the failure message rather than printing
    # it unconditionally on every run.
    assert abs(xres - 350) < 1, "expected xres within 1 of 350, got {}".format(xres)
def test_skip_pages_does_not_replicate(resources, basename, outdir):
    """Check that pages skipped by a zero Tesseract timeout are not duplicated.

    The file ``resources / basename`` is processed with the ``sandwich``
    renderer, ``--force-ocr`` and ``--tesseract-timeout 0``. Every output
    page must then hold exactly one image, the page count must match the
    input, and each page must keep the input page's width in inches.
    """
    infile = resources / basename
    outpdf = outdir / basename

    check_ocrmypdf(
        infile,
        outpdf,
        '--pdf-renderer',
        'sandwich',
        '--force-ocr',
        '--tesseract-timeout',
        '0',
    )

    info_in = pdfinfo.PdfInfo(infile)
    info = pdfinfo.PdfInfo(outpdf)

    for page in info:
        assert len(page.images) == 1, "skipped page was replicated"

    # Compare page counts explicitly: pairing pages below would otherwise
    # ignore any extra output pages.
    assert len(info) == len(info_in), "page count changed"

    for page_in, page_out in zip(info_in, info):
        assert page_out.width_inches == page_in.width_inches
def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
    """Check that the first image of ``jbig2.pdf`` is still JBIG2 in the output.

    The run uses ``--output-type pdf`` and the ``hocr`` renderer, with
    ``spoof_tesseract_cache`` as the environment.
    """
    source = resources / 'jbig2.pdf'
    options = ('--output-type', 'pdf', '--pdf-renderer', 'hocr')
    result = check_ocrmypdf(source, outpdf, *options, env=spoof_tesseract_cache)

    first_image = PdfInfo(result)[0].images[0]
    assert first_image.enc == Encoding.jbig2
'--pdfa-image-compression',
compression,
'-',
output_file,
]
p = run(
p_args,
stdout=PIPE,
stderr=PIPE,
stdin=input_stream,
universal_newlines=True,
env=spoof_tesseract_noop,
)
assert p.returncode == ExitCode.ok, p.stderr
pdfinfo = PdfInfo(output_file)
pdfimage = pdfinfo[0].images[0]
if compression == "jpeg":
assert pdfimage.enc == Encoding.jpeg
else:
if ghostscript.jpeg_passthrough_available():
# Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
# copied without transcoding - so report
if image.endswith('jpg'):
assert pdfimage.enc == Encoding.jpeg
else:
assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)
if im.mode.startswith('RGB') or im.mode.startswith('BGR'):
assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
def test_no_contents(resources):
    """Check that page 0 of ``no_contents.pdf`` has no images and no text."""
    filename = resources / 'no_contents.pdf'
    pdf = pdfinfo.PdfInfo(filename)
    first_page = pdf[0]

    assert len(first_page.images) == 0
    # Truthiness check instead of ``== False``, matching the other
    # ``has_text`` assertions in these tests.
    assert not first_page.has_text
def test_oversized_page(resources):
    """Sanity-check the ``poster.pdf`` fixture.

    For the first image on page 0, ``width * xres`` must exceed 200.
    """
    poster_info = pdfinfo.PdfInfo(resources / 'poster.pdf')
    first_image = poster_info[0].images[0]

    extent = first_image.width * first_image.xres
    assert extent > 200, "this is supposed to be oversized"
def test_userunit_qpdf_passes(spoof_tesseract_cache, poster, outpdf):
    """Check that page 0 keeps its width in inches with ``--output-type=pdf``.

    ``poster`` is analysed before the run and ``outpdf`` after it; the two
    first-page widths must be close according to ``isclose``.
    """
    info_before = PdfInfo(poster)
    check_ocrmypdf(poster, outpdf, '--output-type=pdf', env=spoof_tesseract_cache)
    info_after = PdfInfo(outpdf)

    width_before = info_before[0].width_inches
    width_after = info_after[0].width_inches
    assert isclose(width_before, width_after)