Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_read_portrait(self):
    """The first page of ``portrait.pdf`` must contain the letters a, b, c and d."""
    first_page = pdftotext.PDF(get_file("portrait.pdf"))[0]
    # Checked in the same order as before, so the first missing letter fails first.
    for letter in ("a", "b", "c", "d"):
        self.assertIn(letter, first_page)
def test_pdf_read_all_zero_args(self):
    """Call ``read_all()`` without arguments on a PDF built from ``self.pdf_file``."""
    document = pdftotext.PDF(self.pdf_file)
    # NOTE(review): if read_all() returns a str, "" is always contained in it,
    # so this mostly proves the call succeeds -- confirm the intended expectation.
    self.assertIn("", document.read_all())
def test_pdf_read_invalid_page_number(self):
    """Indexing ``blank.pdf`` at position 100 must raise ``IndexError``."""
    document = pdftotext.PDF(get_file("blank.pdf"))
    out_of_range = 100
    with self.assertRaises(IndexError):
        document[out_of_range]
def test_read_corrupt_page(self):
    """Loading ``corrupt_page.pdf`` and reading page 0 must raise an error.

    Either ``pdftotext.Error`` or ``IndexError`` is accepted.
    """
    accepted_errors = (pdftotext.Error, IndexError)
    with self.assertRaises(accepted_errors):
        # Construction is kept inside the context so a failure at load time
        # counts as well as a failure when the page is read.
        document = pdftotext.PDF(get_file("corrupt_page.pdf"))
        document[0]
def test_pdf_page_count(self):
    """``page_count`` of a PDF built from ``self.pdf_file`` has type ``int``."""
    document = pdftotext.PDF(self.pdf_file)
    count_type = type(document.page_count)
    # Exact type comparison (not isinstance), matching the original check.
    self.assertEqual(count_type, int)
def test_raw_invalid_type(self):
    """Passing a string as the ``raw`` argument must raise ``TypeError``."""
    bad_raw = ""
    with self.assertRaises(TypeError):
        # get_file() is still evaluated inside the context, as in the original.
        pdftotext.PDF(get_file("blank.pdf"), raw=bad_raw)
def test_list_invalid_element(self):
    """Indexing ``two_page.pdf`` at position 2 must raise ``IndexError``."""
    document = pdftotext.PDF(get_file("two_page.pdf"))
    # Callable form of assertRaises; only the subscript is evaluated under the check.
    self.assertRaises(IndexError, lambda: document[2])
def get_text_from_pdf(pdf_file):
    """Return the text of all pages of a PDF joined by newlines.

    ``pdf_file`` is opened in binary mode and handed to ``pdftotext.PDF``.
    If constructing the PDF object raises ``pdftotext.Error``, an empty
    string is returned instead.
    """
    with open(pdf_file, "rb") as handle:
        try:
            pages = pdftotext.PDF(handle)
        except pdftotext.Error:
            return ""
        else:
            # Joining stays outside the guarded region: only construction
            # errors are turned into an empty result.
            return "\n".join(pages)
def extract_text_data(latest_pdf):
"""
Read a PDF with pdftotext and run the ``filters`` extractors on its text.

params
------
latest_pdf : str
    Path of the PDF file to read; it is opened in binary mode.

It is very important to note that some PDF files cannot be
converted to text by the pdftotext library.
"""
# Load the dhs data pdf
with open(latest_pdf, "rb") as f:
pdf = pdftotext.PDF(f)
data = []
# Walk every page (the range covers all of them) and split its text into lines
for page_num in range(len(pdf)):
# NOTE(review): the buffer is reset per page and only flushed on '\n', so
# text after a page's last newline is never appended -- confirm intended.
lines = ""
for char in pdf[page_num]:
if char == '\n':
# A newline ends the current line: store it and start a fresh buffer
data.append(lines)
lines = ""
else:
lines += char
# Re-join the collected lines into one string for the filters below
data = "\n".join(data)
file_date = filters.extract_date(data)
annex1_table = filters.extract_annex1(data)
district_table = filters.extract_district(data)
try:
# convert to Standard ISO 8601 format
def statement(statement_id: str):
    """ Show statement details """
    # The docstring above is kept verbatim: click.echo below suggests this is
    # a click command, and click may surface the docstring as help text.
    # Imports stay function-local, as in the original.
    import pdftotext
    import io

    # Fetch the statement payload and wrap it in a file-like object for pdftotext.
    raw_pdf = API_CLIENT.get_statement(statement_id)
    pages = pdftotext.PDF(io.BytesIO(raw_pdf))
    # Pages are separated by a blank line in the echoed output.
    click.echo("\n\n".join(pages))