Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_page_contents_add(graph, outdir):
    """Add content streams around the first page's contents and save the PDF.

    One stream holding ``q <matrix> cm`` (the matrix comes from
    ``PdfMatrix().rotated(45)``) and one holding ``Q`` are attached to the
    first page via ``page_contents_add``; the document is then written to
    ``outdir / 'out.pdf'``. The test makes no explicit assertions: it passes
    as long as none of these calls raises.

    Args:
        graph: fixture providing the PDF under test.
        outdir: fixture providing the output directory (joined with ``/``).
    """
    first_page = graph.pages[0]
    rotation = PdfMatrix().rotated(45)

    # Save the graphics state and apply the matrix ...
    opening = Stream(graph, b'q ' + rotation.encode() + b' cm')
    # ... and restore the graphics state afterwards.
    closing = Stream(graph, b'Q')

    # NOTE(review): the second positional argument is presumably the
    # "prepend" flag (True -> before existing content) - confirm against
    # the page_contents_add signature.
    first_page.page_contents_add(opening, True)
    first_page.page_contents_add(closing, False)

    graph.save(outdir / 'out.pdf')
image there should be a 'cm' that sets up an image coordinate system
where drawing from (0, 0) -> (1, 1) will draw on the desired area of the
page.
PDF units suit our needs so we initialize ctm to the identity matrix.
According to the PDF specification, the maximum stack depth is 32. Other
viewers tolerate some amount beyond this. We issue a warning if the
stack depth exceeds the spec limit and set a hard limit beyond this to
bound our memory requirements. If the stack underflows, behavior is
undefined in the spec, so we just pretend nothing happened and leave the
CTM unchanged.
"""
stack = []
ctm = PdfMatrix(initial_shorthand)
xobject_settings = []
inline_images = []
found_vector = False
vector_ops = set('S s f F f* B B* b b*'.split())
image_ops = set('BI ID EI q Q Do cm'.split())
operator_whitelist = ' '.join(vector_ops | image_ops)
for n, graphobj in enumerate(
_normalize_stack(
pikepdf.parse_content_stream(contentstream, operator_whitelist)
)
):
operands, operator = graphobj
if operator == 'q':
stack.append(ctm)
if len(stack) > 32: # See docstring
stack.append(ctm)
if len(stack) > 32: # See docstring
if len(stack) > 128:
raise RuntimeError(
"PDF graphics stack overflowed hard limit, operator %i" % n
)
warn("PDF graphics stack overflowed spec limit")
elif operator == 'Q':
try:
ctm = stack.pop()
except IndexError:
# Keeping the ctm the same seems to be the only sensible thing
# to do. Just pretend nothing happened, keep calm and carry on.
warn("PDF graphics stack underflowed - PDF may be malformed")
elif operator == 'cm':
ctm = PdfMatrix(operands) @ ctm
elif operator == 'Do':
image_name = operands[0]
settings = XobjectSettings(
name=image_name, shorthand=ctm.shorthand, stack_depth=len(stack)
)
xobject_settings.append(settings)
elif operator == 'INLINE IMAGE': # BI/ID/EI are grouped into this
iimage = operands[0]
inline = InlineSettings(
iimage=iimage, shorthand=ctm.shorthand, stack_depth=len(stack)
)
inline_images.append(inline)
elif operator in vector_ops:
found_vector = True
return ContentsInfo(
on one page it may be drawn at differing resolutions, and our objective
is to find the resolution at which the page can be rastered without
downsampling.
"""
if container.get('/Type') == '/Page' and '/Contents' in container:
initial_shorthand = shorthand or UNIT_SQUARE
elif container.get('/Type') == '/XObject' and container['/Subtype'] == '/Form':
# Set the CTM to the state it was when the "Do" operator was
# encountered that is drawing this instance of the Form XObject
ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()
# A Form XObject may provide its own matrix to map form space into
# user space. Get this if one exists
form_shorthand = container.get('/Matrix', PdfMatrix.identity())
form_matrix = PdfMatrix(form_shorthand)
# Concatenate form matrix with CTM to ensure CTM is correct for
# drawing this instance of the XObject
ctm = form_matrix @ ctm
initial_shorthand = ctm.shorthand
else:
return
contentsinfo = _interpret_contents(container, initial_shorthand)
if contentsinfo.found_vector:
yield VectorInfo()
yield from _find_inline_images(contentsinfo)
yield from _find_regular_images(container, contentsinfo)
yield from _find_form_xobject_images(pdf, container, contentsinfo)
is to find the resolution at which the page can be rastered without
downsampling.
"""
if container.get('/Type') == '/Page' and '/Contents' in container:
initial_shorthand = shorthand or UNIT_SQUARE
elif container.get('/Type') == '/XObject' and container['/Subtype'] == '/Form':
# Set the CTM to the state it was when the "Do" operator was
# encountered that is drawing this instance of the Form XObject
ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()
# A Form XObject may provide its own matrix to map form space into
# user space. Get this if one exists
form_shorthand = container.get('/Matrix', PdfMatrix.identity())
form_matrix = PdfMatrix(form_shorthand)
# Concatenate form matrix with CTM to ensure CTM is correct for
# drawing this instance of the XObject
ctm = form_matrix @ ctm
initial_shorthand = ctm.shorthand
else:
return
contentsinfo = _interpret_contents(container, initial_shorthand)
if contentsinfo.found_vector:
yield VectorInfo()
yield from _find_inline_images(contentsinfo)
yield from _find_regular_images(container, contentsinfo)
yield from _find_form_xobject_images(pdf, container, contentsinfo)
# content may have a rotation applied. Wrap the text stream with a rotation
# so it will be oriented the same way as the rest of the page content.
# (Previous versions of OCRmyPDF rotated the content layer to match the text.)
mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]
mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]
translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
# -rotation because the input is a clockwise angle and this formula
# uses CCW
rotation = -rotation % 360
rotate = pikepdf.PdfMatrix().rotated(rotation)
# Because of rounding of DPI, we might get a text layer that is not
# identically sized to the target page. Scale to adjust. Normally this
# is within 0.998.
if rotation in (90, 270):
wt, ht = ht, wt
scale_x = wp / wt
scale_y = hp / ht
# log.debug('%r', scale_x, scale_y)
scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)
# Translate the text so it is centered at (0, 0), rotate it there, adjust
# for a size difference between initial and text PDF, then untranslate, and
# finally move the lower left corner to match the mediabox
ctm = translate @ rotate @ scale @ untranslate @ corner
pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()
base_page = pdf_base.pages.p(page_num)
# The text page always will be oriented up by this stage but the original
# content may have a rotation applied. Wrap the text stream with a rotation
# so it will be oriented the same way as the rest of the page content.
# (Previous versions of OCRmyPDF rotated the content layer to match the text.)
mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]
mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]
translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
# -rotation because the input is a clockwise angle and this formula
# uses CCW
rotation = -rotation % 360
rotate = pikepdf.PdfMatrix().rotated(rotation)
# Because of rounding of DPI, we might get a text layer that is not
# identically sized to the target page. Scale to adjust. Normally this
# is within 0.998.
if rotation in (90, 270):
wt, ht = ht, wt
scale_x = wp / wt
scale_y = hp / ht
# log.debug('%r', scale_x, scale_y)
scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)
corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
# -rotation because the input is a clockwise angle and this formula
# uses CCW
rotation = -rotation % 360
rotate = pikepdf.PdfMatrix().rotated(rotation)
# Because of rounding of DPI, we might get a text layer that is not
# identically sized to the target page. Scale to adjust. Normally this
# is within 0.998.
if rotation in (90, 270):
wt, ht = ht, wt
scale_x = wp / wt
scale_y = hp / ht
# log.debug('%r', scale_x, scale_y)
scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)
# Translate the text so it is centered at (0, 0), rotate it there, adjust
# for a size difference between initial and text PDF, then untranslate, and
# finally move the lower left corner to match the mediabox
ctm = translate @ rotate @ scale @ untranslate @ corner
pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'
new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)
if strip_old_text:
strip_invisible_text(pdf_base, base_page)
base_page.page_contents_add(new_text_layer, prepend=True)
_update_page_resources(