Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_password(self):
    """Open the sample PDF with a password and check that characters were parsed.

    The file ``pdfs/password-example.pdf`` (relative to ``HERE``) is opened
    with the password ``"test"``; the test passes when at least one character
    object is found in the document.
    """
    path = os.path.join(HERE, "pdfs/password-example.pdf")
    # Use the context manager so the PDF is closed even when the assertion
    # fails; previously a failing assert skipped the explicit close().
    with pdfplumber.open(path, password="test") as pdf:
        assert len(pdf.chars) > 0
def _load_file(self):
    """Load the document at ``self.path`` and show its first page.

    The current state is cleared first.  Paths whose last dot-separated
    component is ``jpg`` or ``png`` (case-insensitive) are passed through
    ``self._image_to_pdf`` and the returned path is opened instead.  On
    success the viewer state (page count, page index, scale, rotation) is
    reset, the page is redrawn and the window title is updated.

    A ``WandException`` prompts the user to let the application fix the
    ImageMagick policy; ``IndexError``, ``IOError`` and ``TypeError`` raised
    while opening are reported through an error dialog.
    """
    self._clear()
    target = self.path
    base_name = os.path.basename(target)
    extension = base_name.rsplit('.', 1)[-1].lower()
    if extension in ('jpg', 'png'):
        target = self._image_to_pdf(target)

    try:
        self.pdf = pdfplumber.open(target)
        self.total_pages = len(self.pdf.pages)
        # Reset the view to the first page, unscaled and unrotated.
        self.pageidx = 1
        self.scale = 1.0
        self.rotate = 0
        self._update_page()
        self.master.title("PostOCR : {}".format(target))
    except WandException:
        wants_fix = messagebox.askokcancel(
            "Error",
            "ImageMagick Policy Error! Should PostOCR try fixing the error?",
        )
        if wants_fix != 1:
            # User declined the automatic fix: report the failure and stop.
            messagebox.showerror('Error', 'Could not open file!')
            return
        self._fix_policy_error()
        messagebox.showinfo(
            "Policy Fixed!",
            "ImageMagick Policy Error fixed! Restart PostOCR.",
        )
    except (IndexError, IOError, TypeError):
        messagebox.showerror('Error', 'Could not open file!')
def search_page(path):
print('load file:', os.path.basename(path))
pdf = pdfplumber.open(path)
pages = pdf.pages
print('total page:', len(pages))
print('searching pdf...')
target = []
st_flag = False
for ind, page in enumerate(pages):
# print('parse page:', ind + 1)
text = page.extract_text()
lines = re.split(r'\n+', text)
for index, line in enumerate(lines):
if not st_flag and re.match(r'\s*\d+[、.\s]+税金及附加\s*$', line):
st_flag = True
continue
if st_flag and '合计' not in line:
target.append(line)
elif st_flag and '合计' in line:
def parse_UCR_pdf(inf,rptDate,fdate,tdate,verbose=False):
try:
pdf = pdfplumber.open(inf)
docinfo = pdf.metadata
pdf1 = pdf.pages[0]
allTbl = pdf1.extract_tables()
except Exception as e:
print('parse_UCR_pdf: cant load',inf,e)
return None
# .extract_table returns a list of lists, with each inner list representing a row in the table.
tbl = allTbl[0]
if verbose:
print('parse_UCR_pdf: Table found %d x %d' % (len(tbl),len(tbl[0]) ))
statTbl = {}
print('-----')
d = pd.read_csv('source/ftf-all-filings.tsv', sep='\t')
f = open('data/filings-tokens.csv', mode='w')
csv = csv.writer(f)
csv.writerow(['slug','page','x0','y0','x1','y1','token'])
for index, row in d.iterrows():
slug = row['dc_slug']
fname = 'pdfs/' + slug + '.pdf'
print('Extracting ' + fname)
try:
pdf = pdfplumber.open(fname)
nopened += 1
except Exception as e:
print(e)
nopenerror += 1
continue
try:
for p in range(len(pdf.pages)):
for w in pdf.pages[p].extract_words():
if '\0' not in w['text']: # some tokens have nulls in them, which are not valid in a csv
csv.writerow([slug,
p,
float(w['x0']),
float(w['top']),
float(w['x1']),
float(w['bottom']),
validation_split=0.2,
callbacks=[WandbCallback()])
# --- Log output PDF images ---
# convert a single row of document data (one token) to bbox format needed for drawing
def docrow_to_bbox(t):
    """Return the ``[x0, y0, x1, y1]`` values of token row *t* as Decimals.

    *t* is indexed by the keys ``'x0'``, ``'y0'``, ``'x1'`` and ``'y1'``; each
    value is converted with ``Decimal`` and the four results are returned in
    that order.
    """
    corner_keys = ('x0', 'y0', 'x1', 'y1')
    return [Decimal(t[key]) for key in corner_keys]
cnt=0
for doc_idx,doc_rows in enumerate(input_docs(max_docs=read_docs)):
slug = doc_rows[0]['slug']
doc_rows = doc_rows[:max_doc_length]
fname = 'pdfs/' + slug + '.pdf'
try:
pdf = pdfplumber.open(fname)
except Exception as e:
# If the file's not there, that's fine -- we use available PDFs to define what to see
continue
print('Rendering output for ' + fname)
# Get the correct answers: find the indices of the token(s) labelled 1
target_idx = [idx for (idx,val) in enumerate(y[doc_idx]) if val==1]
z = np.array([x[doc_idx]])
predict = model.predict(z)
predict = predict.squeeze(axis=0)
# print our best guess for each document
answer_idx = np.argmax(y[doc_idx])
print(f"Correct answer: {doc_rows[answer_idx]['token']} with score {y[doc_idx][answer_idx]}")