How to use the pdftotext.PDF class in pdftotext

To help you get started, we’ve selected a few pdftotext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github jalan / pdftotext / tests / test_pdf.py View on Github external
def test_read_portrait(self):
    # The first page of portrait.pdf is expected to contain each of the
    # letters a, b, c and d somewhere in its extracted text.
    first_page = pdftotext.PDF(get_file("portrait.pdf"))[0]
    for letter in ("a", "b", "c", "d"):
        self.assertIn(letter, first_page)
github jalan / pdftotext / tests / test_sanity.py View on Github external
def test_pdf_read_all_zero_args(self):
    # Sanity check: read_all() can be called without any arguments.
    # NOTE(review): if read_all() returns a str, `"" in text` is always true,
    # so this only proves the call did not raise -- confirm that is the intent.
    text = pdftotext.PDF(self.pdf_file).read_all()
    self.assertIn("", text)
github jalan / pdftotext / tests / test_pdf.py View on Github external
def test_pdf_read_invalid_page_number(self):
    # Index 100 is expected to be out of range for blank.pdf, so subscripting
    # must raise IndexError.
    blank = pdftotext.PDF(get_file("blank.pdf"))
    self.assertRaises(IndexError, lambda: blank[100])
github jalan / pdftotext / tests / test_pdf.py View on Github external
def test_read_corrupt_page(self):
    # Both opening the document and reading page 0 happen inside the
    # assertRaises context, so the failure may surface at either step.
    acceptable_errors = (pdftotext.Error, IndexError)
    with self.assertRaises(acceptable_errors):
        damaged = pdftotext.PDF(get_file("corrupt_page.pdf"))
        damaged[0]
github jalan / pdftotext / tests / test_sanity.py View on Github external
def test_pdf_page_count(self):
    # Compares the exact type (not isinstance): page_count must be an int.
    document = pdftotext.PDF(self.pdf_file)
    count_type = type(document.page_count)
    self.assertEqual(count_type, int)
github jalan / pdftotext / tests / test_pdf.py View on Github external
def test_raw_invalid_type(self):
    # Passing a str for the `raw` keyword must be rejected with TypeError.
    def build_with_bad_raw():
        return pdftotext.PDF(get_file("blank.pdf"), raw="")

    self.assertRaises(TypeError, build_with_bad_raw)
github jalan / pdftotext / tests / test_pdf.py View on Github external
def test_list_invalid_element(self):
    # Index 2 is expected to be out of range for two_page.pdf.
    out_of_range = 2
    two_pages = pdftotext.PDF(get_file("two_page.pdf"))
    with self.assertRaises(IndexError):
        two_pages[out_of_range]
github the-paperless-project / paperless / src / paperless_tesseract / parsers.py View on Github external
def get_text_from_pdf(pdf_file):
    """Return the text of every page in *pdf_file*, joined by newlines.

    The file is opened in binary mode and handed to ``pdftotext.PDF``. If
    pdftotext raises ``pdftotext.Error`` while loading it, an empty string
    is returned instead. Errors from ``open`` itself are not caught.
    """
    with open(pdf_file, "rb") as stream:
        try:
            pages = pdftotext.PDF(stream)
        except pdftotext.Error:
            return ""
        else:
            return "\n".join(pages)
github yedhink / covid19-kerala-api / scripts / extract-text-data.py View on Github external
def extract_text_data(latest_pdf):
    """
    params
    ------
    latest-pdf file - string
    it's very important to note that some of the pdf files
    are not convertable to text using pdftotext library
    """
    # Load the dhs data pdf
    with open(latest_pdf, "rb") as f:
        pdf = pdftotext.PDF(f)
    data = []
    # Iterate over only the required pages by providing range
    for page_num in range(len(pdf)):
        lines = ""
        for char in pdf[page_num]:
            if char == '\n':
                data.append(lines)
                lines = ""
            else:
                lines += char
    data = "\n".join(data)
    file_date = filters.extract_date(data)
    annex1_table = filters.extract_annex1(data)
    district_table = filters.extract_district(data)
    try:
        # convert to Standard ISO 8601 format
github femueller / python-n26 / n26 / cli.py View on Github external
def statement(statement_id: str):
    """ Show statement details """
    # Imported inside the function rather than at module level, so pdftotext
    # is only loaded when this function actually runs.
    import io
    import pdftotext

    # The fetched statement is wrapped in BytesIO so pdftotext can read it
    # like a file.
    raw_pdf = API_CLIENT.get_statement(statement_id)
    pages = pdftotext.PDF(io.BytesIO(raw_pdf))
    # Pages are separated by a blank line in the echoed output.
    click.echo("\n\n".join(pages))

pdftotext

Simple PDF text extraction

MIT
Latest version published 3 years ago

Package Health Score

58 / 100
Full package analysis

Popular pdftotext functions