How to use the pdfplumber.open function in pdfplumber

To help you get started, we’ve selected a few pdfplumber examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github jsvine / pdfplumber / tests / test-basics.py View on Github external
def test_password(self):
        """Check that a password-protected PDF opens and yields characters.

        Opens ``pdfs/password-example.pdf`` (relative to ``HERE``) with the
        password ``"test"`` and asserts that at least one character object
        is found in the document.
        """
        path = os.path.join(HERE, "pdfs/password-example.pdf")
        # Use the context manager so the file handle is released even when
        # the assertion below fails (a bare close() would be skipped).
        with pdfplumber.open(path, password="test") as pdf:
            assert len(pdf.chars) > 0
github naiveHobo / PostOCR / PostOCR / PostOCR.py View on Github external
def _load_file(self):
        """Reset the viewer and open the document found at ``self.path``.

        Files whose extension is ``jpg`` or ``png`` are first passed through
        ``self._image_to_pdf`` and the returned path is opened instead.
        On success the page count, page index, scale and rotation are
        initialised, the page is redrawn and the window title is updated.

        A ``WandException`` prompts the user to let the application try to
        fix the ImageMagick policy; ``IndexError``, ``IOError`` and
        ``TypeError`` are reported with an error dialog.
        """
        self._clear()
        target = self.path
        extension = os.path.basename(target).split('.')[-1].lower()
        if extension in ('jpg', 'png'):
            # presumably converts the image into a PDF and returns its path
            target = self._image_to_pdf(target)
        try:
            self.pdf = pdfplumber.open(target)
            self.total_pages = len(self.pdf.pages)
            # Start on the first page, unscaled and unrotated.
            self.pageidx = 1
            self.scale = 1.0
            self.rotate = 0
            self._update_page()
            self.master.title("PostOCR : {}".format(target))
        except WandException:
            wants_fix = messagebox.askokcancel(
                "Error",
                "ImageMagick Policy Error! Should PostOCR try fixing the error?",
            )
            if wants_fix != 1:
                messagebox.showerror('Error', 'Could not open file!')
            else:
                self._fix_policy_error()
                messagebox.showinfo(
                    "Policy Fixed!",
                    "ImageMagick Policy Error fixed! Restart PostOCR.",
                )
        except (IndexError, IOError, TypeError):
            messagebox.showerror('Error', 'Could not open file!')
github yooongchun / PDFParser / PDF_Table_single_table / PDFParser.py View on Github external
def search_page(path):
    print('load file:', os.path.basename(path))
    pdf = pdfplumber.open(path)
    pages = pdf.pages
    print('total page:', len(pages))
    print('searching pdf...')
    target = []
    st_flag = False
    for ind, page in enumerate(pages):
        # print('parse page:', ind + 1)
        text = page.extract_text()
        lines = re.split(r'\n+', text)
        for index, line in enumerate(lines):
            if not st_flag and re.match(r'\s*\d+[、.\s]+税金及附加\s*$', line):
                st_flag = True
                continue
            if st_flag and '合计' not in line:
                target.append(line)
            elif st_flag and '合计' in line:
github openoakland / OakCrime / showCrime / dailyIncid / management / commands / parse_UCR.py View on Github external
def parse_UCR_pdf(inf,rptDate,fdate,tdate,verbose=False):
	
	try:
		pdf = pdfplumber.open(inf)
		docinfo = pdf.metadata

		pdf1 = pdf.pages[0]	
		allTbl = pdf1.extract_tables()
	except Exception as e:
		print('parse_UCR_pdf: cant load',inf,e)
		return None

	# .extract_tables returns a list of tables; each table is a list of lists, with each inner list representing a row in the table.
	tbl = allTbl[0]
	
	if verbose:
		print('parse_UCR_pdf: Table found %d x %d' % (len(tbl),len(tbl[0]) ))
	
	statTbl = {}
github jstray / deepform / tokenize_pdfs.py View on Github external
print('-----')


d = pd.read_csv('source/ftf-all-filings.tsv', sep='\t')

f = open('data/filings-tokens.csv', mode='w')
csv = csv.writer(f)
csv.writerow(['slug','page','x0','y0','x1','y1','token'])

for index, row in d.iterrows():
	slug = row['dc_slug']
	fname = 'pdfs/' + slug + '.pdf'
	print('Extracting ' +  fname)

	try:
		pdf = pdfplumber.open(fname)
		nopened += 1
	except Exception as e:
		print(e)
		nopenerror += 1
		continue

	try:		
		for p in range(len(pdf.pages)):
			for w in pdf.pages[p].extract_words():
				if '\0' not in w['text']:  # some tokens have nulls in them, which are not valid in a csv
					csv.writerow([slug, 
												p, 
												float(w['x0']), 
												float(w['top']), 
												float(w['x1']), 
												float(w['bottom']),
github jstray / deepform / train-unet.py View on Github external
validation_split=0.2,
		callbacks=[WandbCallback()])

# --- Log output PDF images ---

# convert a single row of document data (one token) to bbox format needed for drawing
def docrow_to_bbox(t):
	"""Return ``[x0, y0, x1, y1]`` from token row *t*, each converted to ``Decimal``."""
	corner_keys = ('x0', 'y0', 'x1', 'y1')
	return [Decimal(t[key]) for key in corner_keys]
	
cnt=0
for doc_idx,doc_rows in enumerate(input_docs(max_docs=read_docs)):
	slug = doc_rows[0]['slug']
	doc_rows = doc_rows[:max_doc_length]
	fname = 'pdfs/' + slug + '.pdf'
	try:
		pdf = pdfplumber.open(fname)
	except Exception as e:
		# If the file's not there, that's fine -- we use available PDFs to define what to see
		continue

	print('Rendering output for ' +  fname)

	# Get the correct answers: find the indices of the token(s) labelled 1
	target_idx = [idx for (idx,val) in enumerate(y[doc_idx]) if val==1]
			
	z = np.array([x[doc_idx]])
	predict = model.predict(z)
	predict = predict.squeeze(axis=0)
	
	# print our best guess for each document
	answer_idx = np.argmax(y[doc_idx])
	print(f"Correct answer: {doc_rows[answer_idx]['token']} with score {y[doc_idx][answer_idx]}")