How to use ocrmypdf - 10 common examples

To help you get started, we’ve selected a few ocrmypdf examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

jbarlow83 / OCRmyPDF / tests / test_main.py View on Github

def test_ocr_timeout(renderer, resources, outpdf):
    """Check that a zero Tesseract timeout yields a first page with no text.

    Runs ``check_ocrmypdf`` on ``skew.pdf`` with ``--tesseract-timeout 0``
    and the PDF renderer supplied by the ``renderer`` fixture, then inspects
    the produced file with ``PdfInfo``.
    """
    cli_options = ['--tesseract-timeout', '0', '--pdf-renderer', renderer]
    produced = check_ocrmypdf(resources / 'skew.pdf', outpdf, *cli_options)

    # Presumably OCR is abandoned immediately with a timeout of 0, so no text
    # layer should be reported for the first page.
    first_page = PdfInfo(produced)[0]
    assert not first_page.has_text

jbarlow83 / OCRmyPDF / tests / test_main.py View on Github

def test_redo_ocr(resources, outpdf):
    """Check that ``--redo-ocr`` changes the text areas of an OCRed PDF.

    ``graph_ocred.pdf`` is analysed before and after running
    ``check_ocrmypdf`` with ``--redo-ocr``; both versions must report text
    on the first page, and the first page's text areas must differ.
    """
    source_pdf = resources / 'graph_ocred.pdf'
    info_before = PdfInfo(source_pdf, detailed_page_analysis=True)

    redone_pdf = check_ocrmypdf(source_pdf, outpdf, '--redo-ocr')
    info_after = PdfInfo(redone_pdf, detailed_page_analysis=True)

    assert info_before[0].has_text and info_after[0].has_text

    areas_before = info_before[0].get_textareas()
    areas_after = info_after[0].get_textareas()
    assert areas_before != areas_after, "Expected text to be different after re-OCR"

jbarlow83 / OCRmyPDF / tests / test_pdfinfo.py View on Github

def test_jpeg(resources, outdir):
    """Check the encoding and ``xres`` of the first image in ``c02-22.pdf``.

    The first image on the first page must be reported as JPEG-encoded with
    an ``xres`` close to 150.
    """
    first_image = pdfinfo.PdfInfo(resources / 'c02-22.pdf')[0].images[0]

    assert first_image.enc == Encoding.jpeg
    assert isclose(first_image.xres, 150)

jbarlow83 / OCRmyPDF / tests / test_main.py View on Github

def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
    """Check that ``--oversample 350`` gives a first page ``xres`` near 350.

    Runs ``check_ocrmypdf`` on ``skew.pdf`` with ``--oversample 350``, ``-f``
    and the PDF renderer supplied by the ``renderer`` fixture, using the
    ``spoof_tesseract_cache`` fixture as the environment. The first page's
    reported ``xres`` must be within 1 of the requested value.
    """
    oversampled_pdf = check_ocrmypdf(
        resources / 'skew.pdf',
        outpdf,
        '--oversample',
        '350',
        '-f',
        '--pdf-renderer',
        renderer,
        env=spoof_tesseract_cache,
    )

    pdfinfo = PdfInfo(oversampled_pdf)

    # Printed so the measured value is visible in the captured test output.
    print(pdfinfo[0].xres)
    # The operator had been HTML-escaped as "&lt;", which Python reads as
    # "& lt" followed by a separate statement; it must be a plain "<".
    assert abs(pdfinfo[0].xres - 350) < 1

jbarlow83 / OCRmyPDF / tests / test_tess4.py View on Github

def test_skip_pages_does_not_replicate(resources, basename, outdir):
    """Check page images and widths after a forced OCR with a zero timeout.

    Runs ``check_ocrmypdf`` with the ``sandwich`` renderer, ``--force-ocr``
    and ``--tesseract-timeout 0``. Every output page must contain exactly one
    image, and each output page's width in inches must equal that of the
    corresponding input page.
    """
    infile = resources / basename
    outpdf = outdir / basename

    cli_options = [
        '--pdf-renderer',
        'sandwich',
        '--force-ocr',
        '--tesseract-timeout',
        '0',
    ]
    check_ocrmypdf(infile, outpdf, *cli_options)

    source_info = pdfinfo.PdfInfo(infile)
    result_info = pdfinfo.PdfInfo(outpdf)

    for result_page in result_info:
        assert len(result_page.images) == 1, "skipped page was replicated"

    # Compare widths page by page, driven by the input's page order.
    for index, source_page in enumerate(source_info):
        assert result_info[index].width_inches == source_page.width_inches

jbarlow83 / OCRmyPDF / tests / test_main.py View on Github

def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
    """Check that the first image of ``jbig2.pdf`` is still JBIG2 afterwards.

    Runs ``check_ocrmypdf`` with ``--output-type pdf`` and the ``hocr``
    renderer, using the ``spoof_tesseract_cache`` fixture as the environment,
    then checks the encoding of the first image on the first output page.
    """
    cli_options = ['--output-type', 'pdf', '--pdf-renderer', 'hocr']
    produced = check_ocrmypdf(
        resources / 'jbig2.pdf',
        outpdf,
        *cli_options,
        env=spoof_tesseract_cache,
    )

    first_image = PdfInfo(produced)[0].images[0]
    assert first_image.enc == Encoding.jbig2

jbarlow83 / OCRmyPDF / tests / test_main.py View on Github

'--pdfa-image-compression',
            compression,
            '-',
            output_file,
        ]
        p = run(
            p_args,
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            universal_newlines=True,
            env=spoof_tesseract_noop,
        )
        assert p.returncode == ExitCode.ok, p.stderr

    pdfinfo = PdfInfo(output_file)

    pdfimage = pdfinfo[0].images[0]

    if compression == "jpeg":
        assert pdfimage.enc == Encoding.jpeg
    else:
        if ghostscript.jpeg_passthrough_available():
            # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
            # copied without transcoding - so report
            if image.endswith('jpg'):
                assert pdfimage.enc == Encoding.jpeg
        else:
            assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)

    if im.mode.startswith('RGB') or im.mode.startswith('BGR'):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"

jbarlow83 / OCRmyPDF / tests / test_pdfinfo.py View on Github

def test_no_contents(resources):
    """Check that ``no_contents.pdf`` reports no images and no text.

    The first page of the file must have an empty image list and a falsy
    ``has_text``.
    """
    filename = resources / 'no_contents.pdf'

    pdf = pdfinfo.PdfInfo(filename)
    assert len(pdf[0].images) == 0
    # Truthiness check rather than "== False", matching the other has_text
    # assertions in these tests.
    assert not pdf[0].has_text

jbarlow83 / OCRmyPDF / tests / test_pdfinfo.py View on Github

def test_oversized_page(resources):
    """Check that the first image of ``poster.pdf`` counts as oversized.

    The product of the first image's ``width`` and ``xres`` must exceed 200.
    """
    first_page = pdfinfo.PdfInfo(resources / 'poster.pdf')[0]
    poster_image = first_page.images[0]
    size_metric = poster_image.width * poster_image.xres
    assert size_metric > 200, "this is supposed to be oversized"

jbarlow83 / OCRmyPDF / tests / test_userunit.py View on Github

def test_userunit_qpdf_passes(spoof_tesseract_cache, poster, outpdf):
    """Check that the first page width is preserved with ``--output-type=pdf``.

    The first page's width in inches is read from ``poster`` before running
    ``check_ocrmypdf`` and from ``outpdf`` afterwards; the two values must be
    close.
    """
    width_before = PdfInfo(poster)[0].width_inches

    check_ocrmypdf(poster, outpdf, '--output-type=pdf', env=spoof_tesseract_cache)

    width_after = PdfInfo(outpdf)[0].width_inches
    assert isclose(width_before, width_after)

How to use ocrmypdf - 10 common examples

To help you get started, we’ve selected a few ocrmypdf examples, based on popular ways it is used in public projects.

ocrmypdf

Package Health Score

Popular ocrmypdf functions

Similar packages