Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_invalid_stream_object():
    """Check that parsing a bare ``Dictionary`` is rejected with ``TypeError``.

    The dictionary is built inside the ``pytest.raises`` context, so a
    ``TypeError`` from either the construction or the parse call satisfies
    the expectation.
    """
    with pytest.raises(TypeError):
        not_a_stream = Dictionary({"/Hi": 3})
        parse_content_stream(not_a_stream)
def test_text_filter(resources, outdir):
input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'
# Ensure the test PDF has text we can find
proc = run(
['pdftotext', str(input_pdf), '-'], check=True, stdout=PIPE, encoding='utf-8'
)
assert proc.stdout.strip() != '', "Need input test file that contains text"
pdf = Pdf.open(input_pdf)
page = pdf.pages[0]
keep = []
for operands, command in parse_content_stream(
page, """TJ Tj ' " BT ET Td TD Tm T* Tc Tw Tz TL Tf Tr Ts"""
):
if command == Operator('Tj'):
print("skipping Tj")
continue
keep.append((operands, command))
new_stream = Stream(pdf, keep)
print(new_stream.read_bytes()) # pylint: disable=no-member
page['/Contents'] = new_stream
page['/Rotate'] = 90
pdf.save(outdir / 'notext.pdf', True)
proc = run(
['pdftotext', str(outdir / 'notext.pdf'), '-'],
def inline(resources):
    """Locate the first inline image on page one of the sample PDF.

    Opens ``image-mono-inline.pdf`` under *resources* and scans the parsed
    content stream of its first page for an instruction whose leading operand
    is a ``PdfInlineImage``.

    Returns:
        A ``(inline_image, pdf)`` tuple for the first match, or ``None`` when
        no instruction on the page starts with an inline image.
    """
    pdf = Pdf.open(resources / 'image-mono-inline.pdf')
    first_page = pdf.pages[0]
    for operands, _command in parse_content_stream(first_page):
        # Instructions without operands cannot carry an inline image.
        if not operands:
            continue
        leading = operands[0]
        if isinstance(leading, PdfInlineImage):
            # NOTE(review): the pdf is returned alongside the image,
            # presumably so the caller keeps the document alive — confirm.
            return leading, pdf
    return None
undefined in the spec, but we just pretend nothing happened and leave the
CTM unchanged.
"""
stack = []
ctm = PdfMatrix(initial_shorthand)
xobject_settings = []
inline_images = []
found_vector = False
vector_ops = set('S s f F f* B B* b b*'.split())
image_ops = set('BI ID EI q Q Do cm'.split())
operator_whitelist = ' '.join(vector_ops | image_ops)
for n, graphobj in enumerate(
_normalize_stack(
pikepdf.parse_content_stream(contentstream, operator_whitelist)
)
):
operands, operator = graphobj
if operator == 'q':
stack.append(ctm)
if len(stack) > 32: # See docstring
if len(stack) > 128:
raise RuntimeError(
"PDF graphics stack overflowed hard limit, operator %i" % n
)
warn("PDF graphics stack overflowed spec limit")
elif operator == 'Q':
try:
ctm = stack.pop()
except IndexError:
# Keeping the ctm the same seems to be the only sensible thing
def strip_invisible_text(pdf, page):
stream = []
in_text_obj = False
render_mode = 0
text_objects = []
page.page_contents_coalesce()
for operands, operator in pikepdf.parse_content_stream(page, ''):
if not in_text_obj:
if operator == pikepdf.Operator('BT'):
in_text_obj = True
render_mode = 0
text_objects.append((operands, operator))
else:
stream.append((operands, operator))
else:
if operator == pikepdf.Operator('Tr'):
render_mode = operands[0]
text_objects.append((operands, operator))
if operator == pikepdf.Operator('ET'):
in_text_obj = False
if render_mode != 3:
stream.extend(text_objects)
text_objects.clear()