Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_limited_pages(resources, outpdf, spoof_tesseract_cache):
    """Call ``ocrmypdf.ocr`` with ``pages='5-6'`` and inspect the output text.

    The first page of the result must report no text, while the fifth and
    sixth pages (zero-based indices 4 and 5) must report text.
    """
    source_pdf = resources / 'multipage.pdf'
    ocrmypdf.ocr(
        source_pdf,
        outpdf,
        pages='5-6',
        optimize=0,
        output_type='pdf',
        tesseract_env=spoof_tesseract_cache,
    )

    info = PdfInfo(outpdf)
    # Page outside the requested range.
    assert not info.pages[0].has_text
    # Pages inside the requested range.
    assert info.pages[4].has_text
    assert info.pages[5].has_text
def test_masks(spoof_tesseract_noop, resources, outpdf):
    """Check that ``ocrmypdf.ocr`` on ``masks.pdf`` returns ``ExitCode.ok``."""
    exit_code = ocrmypdf.ocr(
        resources / 'masks.pdf',
        outpdf,
        tesseract_env=spoof_tesseract_noop,
    )
    assert exit_code == ExitCode.ok
def test_no_glyphless_graft(resources, outdir):
    """Run OCR on a concatenated PDF with ``MAX_REPLACE_PAGES`` patched to 2.

    The pages of ``francais.pdf``, ``aspect.pdf`` and ``cmyk.pdf`` are joined
    into ``outdir / 'test.pdf'``, which is then passed to ``ocrmypdf.ocr``
    with ``deskew=True`` and ``tesseract_timeout=0``. There is no explicit
    assertion: the test passes if nothing raises.
    """
    combined_path = outdir / 'test.pdf'

    # Keep every source PDF open until the combined file has been saved (the
    # appended pages come from the other documents), then close all handles
    # deterministically instead of leaking them.
    with pikepdf.open(resources / 'francais.pdf') as pdf:
        with pikepdf.open(resources / 'aspect.pdf') as pdf_aspect:
            with pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk:
                pdf.pages.extend(pdf_aspect.pages)
                pdf.pages.extend(pdf_cmyk.pages)
                pdf.save(combined_path)

    with patch('ocrmypdf._graft.MAX_REPLACE_PAGES', 2):
        ocrmypdf.ocr(
            combined_path, outdir / 'out.pdf', deskew=True, tesseract_timeout=0
        )
def test_links(resources, outpdf):
    """Check that cross-page link destinations still match after OCR.

    ``link.pdf`` is processed with ``redo_ocr=True`` and ``oversample=200``.
    In the output, the destination of the first annotation on page 1 must be
    page 2, and the destination of the first annotation on page 2 must be
    page 1 (compared by ``objgen``).
    """
    ocrmypdf.ocr(
        resources / 'link.pdf', outpdf, redo_ocr=True, oversample=200, output_type='pdf'
    )

    # Use a context manager so the output file handle is closed even when an
    # assertion fails.
    with pikepdf.open(outpdf) as pdf:
        p1 = pdf.pages[0]
        p2 = pdf.pages[1]
        assert p1.Annots[0].A.D[0].objgen == p2.objgen
        assert p2.Annots[0].A.D[0].objgen == p1.objgen
def test_filter_from_api(resources, outdir):
    """Pass ``whiteout`` as ``filter_ocr_image`` and expect an empty sidecar.

    ``crom.png`` is processed at ``image_dpi=100``; afterwards the sidecar
    text file must contain nothing but whitespace.
    """
    sidecar_path = outdir / 'sidecar.txt'
    ocrmypdf.ocr(
        resources / 'crom.png',
        outdir / 'out.pdf',
        image_dpi=100,
        sidecar=sidecar_path,
        filter_ocr_image=whiteout,
    )

    recognized_text = sidecar_path.read_text().strip()
    assert recognized_text == ''