Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_layout_getcomponents(self):
    """Check that block-level component extraction returns a large first block."""
    api = self._api
    api.Init()
    api.SetImageFile(self._image_file)
    components = api.GetComponentImages(tesserocr.RIL.BLOCK, True)
    # The component list must not be empty.
    self.assertTrue(components)
    # Each component unpacks into four items; only the box dict is inspected.
    _image, box, _block_id, _paragraph_id = components[0]
    for key in ('w', 'h'):
        self.assertIn(key, box)
    # The first block is expected to cover a substantial area.
    self.assertGreater(box['w'] * box['h'], 400000)
def test_layout_boundingbox(self):
    """Check that layout analysis reports a large bounding box for the first block."""
    api = self._api
    api.Init()
    api.SetImageFile(self._image_file)
    page_it = api.AnalyseLayout()
    # Layout analysis must yield a truthy iterator positioned on a block.
    self.assertTrue(page_it)
    self.assertFalse(page_it.Empty(tesserocr.RIL.BLOCK))
    bbox = page_it.BoundingBox(tesserocr.RIL.BLOCK)
    self.assertIsNot(bbox, None)
    left, top, right, bottom = bbox
    width = right - left
    height = bottom - top
    # The block is expected to cover a substantial area.
    self.assertGreater(width * height, 400000)
def segment(self):
"""
Performs the segmentation.
"""
# For every ID in self.handle.img_files: open the image, hand it to
# Tesseract, request the block-level components and, when there are any,
# record one RegionRefIndexed element per component under a new
# ReadingOrder element of the page.
# NOTE(review): the indentation of this excerpt has been lost and the body
# appears to stop after the regionRef attribute is set -- confirm the
# nesting and the remainder against the original module.
with tesserocr.PyTessBaseAPI() as api:
for ID in self.handle.img_files:
image = Image.open(self.handle.img_files[ID])
api.SetImage(image)
# second positional argument is presumably tesserocr's text-only flag -- TODO confirm
boxes = api.GetComponentImages(tesserocr.RIL.BLOCK, True)
if len(boxes) > 0:
# get XML for ID
if ID in self.handle.page_trees:
PcGts = self.handle.page_trees[ID].getroot()
pages = PcGts.xpath("page:Page", namespaces=ns)
# reuse the first matching Page element, otherwise create one under the root
if len(pages) > 0:
page = pages[0]
else:
page = ET.SubElement(PcGts,"Page")
reading_order = ET.SubElement(page,"ReadingOrder")
# each entry of boxes is unpacked into four items; of these only
# `index` is used in the lines visible here
for i, (im, box, index, _) in enumerate(boxes):
# the region reference in the reading order element
region_ref = "r%i" % index
region_ref_indexed = ET.SubElement(reading_order,"RegionRefIndexed")
region_ref_indexed.set("regionRef", region_ref)
# Excerpt from the middle of a larger routine (its def line and the
# definitions of ro, page, index, it, page_image and page_coords are not
# part of this view): makes sure the page has a reading order with an
# ordered group, then walks the iterator `it` block by block and converts
# each block outline into a points value.
# create and attach a reading order when none exists yet
if not ro:
ro = ReadingOrderType()
page.set_ReadingOrder(ro)
og = ro.get_OrderedGroup()
if og:
# start counting from largest existing index
# (index is raised to one past any member index that is >= its current value)
for elem in (og.get_RegionRefIndexed() +
og.get_OrderedGroupIndexed() +
og.get_UnorderedGroupIndexed()):
if elem.index >= index:
index = elem.index + 1
else:
# new top-level group
og = OrderedGroupType(id="reading-order")
ro.set_OrderedGroup(og)
# NOTE(review): the statement that advances `it` is not visible in this excerpt
while it and not it.Empty(RIL.BLOCK):
# (padding will be passed to both BoundingBox and GetImage)
# (actually, Tesseract honours padding only on the left and bottom,
# whereas right and top are increased less!)
bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
# sometimes these polygons are not planar, which causes
# PIL.ImageDraw.Draw.polygon (and likely others as well)
# to misbehave; however, PAGE coordinate semantics prohibit
# multi-path polygons!
# (probably a bug in Tesseract itself, cf. tesseract#2826):
# the 'crop_polygons' parameter selects the iterator's block polygon
# instead of a polygon built from the bounding box
if self.parameter['crop_polygons']:
polygon = it.BlockPolygon()
else:
polygon = polygon_from_x0y0x1y1(bbox)
# presumably maps the polygon from image coordinates into the page's
# coordinate system and constrains it to the page -- verify against the helpers
polygon = coordinates_for_segment(polygon, page_image, page_coords)
polygon = polygon_for_parent(polygon, page)
points = points_from_polygon(polygon)
value=self.parameter[name])
for name in self.parameter.keys()])]))
# Excerpt from the middle of a larger routine (its def line and the
# definitions of pcgts, page_id, oplevel, tessapi, input_file and file_id
# are not part of this view): fetches the page image and hands each region,
# or each text line of each text region, to self._process_segment.
page = pcgts.get_Page()
page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id)
LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
# text regions and table regions are handled together
regions = page.get_TextRegion() + page.get_TableRegion()
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
for region in regions:
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh)
if oplevel == 'region':
# region level: the whole region image is processed as a single block
tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh,
"region '%s'" % region.id, input_file.pageId,
file_id + '_' + region.id)
# any other level: only TextRegionType instances are descended into
# (presumably table regions are skipped here -- verify)
elif isinstance(region, TextRegionType):
lines = region.get_TextLine()
if not lines:
LOG.warning("Page '%s' region '%s' contains no text lines",
page_id, region.id)
for line in lines:
# line images are taken relative to the region image and its coordinates
line_image, line_xywh = self.workspace.image_from_segment(
line, region_image, region_xywh)
tessapi.SetPageSegMode(PSM.SINGLE_LINE)
self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh,
"line '%s'" % line.id, input_file.pageId,
file_id + '_' + region.id + '_' + line.id)
# Use input_file's basename for the new file -
# Excerpt from the middle of a larger routine (its def line and the
# definitions of tessapi, page_image and zoom are not part of this view):
# runs Tesseract on the page image and inspects every detected block.
# NOTE(review): the excerpt stops on an `if` whose body is not visible.
LOG.debug("Cropping with tesseract")
tessapi.SetImage(page_image)
# PSM.SPARSE_TEXT: get as much text as possible in no particular order
# PSM.AUTO (default): includes tables (dangerous)
tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
#
# helper variables for saving the box coordinates
#
# minima start at the far edge of the image and maxima at 0
min_x = page_image.width
min_y = page_image.height
max_x = 0
max_y = 0
# iterate over all text blocks and compare their
# bbox extent to the running min and max values
for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
image, xywh, index, _ = component
#
# the region reference in the reading order element
#
ID = "region%04d" % index
left, top, right, bottom = bbox_from_xywh(xywh)
LOG.debug("Detected text region '%s': %i:%i,%i:%i",
ID, left, right, top, bottom)
# filter region results:
# getbbox() result is falsy when the component image has no content
# (presumably PIL's Image.getbbox returning None -- TODO confirm)
bin_bbox = image.getbbox()
if not bin_bbox:
# this does happen!
LOG.info("Ignoring region '%s' because its binarization is empty", ID)
continue
# width of the non-empty part of the component image
width = bin_bbox[2]-bin_bbox[0]
# threshold is scaled by `zoom`, which is defined outside this excerpt
if width < 25 / zoom:
def _process_region(self, it, region, rogroup, region_image, region_coords):
# Walk the iterator `it` at block level and build a TextRegionType
# sub-region of `region` for the current block.
#   it            -- iterator queried via Empty/BoundingBox/BlockType at RIL.BLOCK
#   region        -- parent region; only its `id` is read in the lines visible here
#   rogroup       -- optional reading-order group whose existing members
#                    determine the first free index
#   region_image, region_coords -- passed to coordinates_for_segment to
#                    convert each block's bounding box
# NOTE(review): this excerpt appears to be cut off after BlockType(); the
# statement that advances `it` is not visible here.
# equivalent to GetComponentImages with raw_image=True,
# (which would also give raw coordinates),
# except we are also interested in the iterator's BlockType() here,
index = 0
if rogroup:
# continue numbering after the largest index already used in the group
for elem in (rogroup.get_RegionRefIndexed() +
rogroup.get_OrderedGroupIndexed() +
rogroup.get_UnorderedGroupIndexed()):
if elem.index >= index:
index = elem.index + 1
while it and not it.Empty(RIL.BLOCK):
# bounding box -> polygon -> converted coordinates -> points -> Coords
bbox = it.BoundingBox(RIL.BLOCK)
polygon = polygon_from_x0y0x1y1(bbox)
polygon = coordinates_for_segment(polygon, region_image, region_coords)
points = points_from_polygon(polygon)
coords = CoordsType(points=points)
# if xywh['w'] < 30 or xywh['h'] < 30:
# LOG.info('Ignoring too small region: %s', points)
# it.Next(RIL.BLOCK)
# continue
#
# add the region reference in the reading order element
# (but ignore non-text regions entirely)
# sub-region id is the parent id plus a zero-padded four-digit index
ID = region.id + "_%04d" % index
subregion = TextRegionType(id=ID, Coords=coords,
type=TextTypeSimpleType.PARAGRAPH)
block_type = it.BlockType()
# Excerpt from the middle of a larger routine (its def line and the
# definitions of og, ro, index, it, page, page_image and page_coords are
# not part of this view): picks the next free reading-order index or creates
# the ordered group, then walks the iterator `it` block by block and builds
# a Coords object for each block outline.
if og:
# start counting from largest existing index
# (index is raised to one past any member index that is >= its current value)
for elem in (og.get_RegionRefIndexed() +
og.get_OrderedGroupIndexed() +
og.get_UnorderedGroupIndexed()):
if elem.index >= index:
index = elem.index + 1
else:
# new top-level group
og = OrderedGroupType(id="reading-order")
ro.set_OrderedGroup(og)
# NOTE(review): the statement that advances `it` is not visible in this excerpt
while it and not it.Empty(RIL.BLOCK):
# (padding will be passed to both BoundingBox and GetImage)
# (actually, Tesseract honours padding only on the left and bottom,
# whereas right and top are increased less!)
bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
# sometimes these polygons are not planar, which causes
# PIL.ImageDraw.Draw.polygon (and likely others as well)
# to misbehave; however, PAGE coordinate semantics prohibit
# multi-path polygons!
# (probably a bug in Tesseract itself, cf. tesseract#2826):
# the 'crop_polygons' parameter selects the iterator's block polygon
# instead of a polygon built from the bounding box
if self.parameter['crop_polygons']:
polygon = it.BlockPolygon()
else:
polygon = polygon_from_x0y0x1y1(bbox)
# presumably maps the polygon from image coordinates into the page's
# coordinate system and constrains it to the page -- verify against the helpers
polygon = coordinates_for_segment(polygon, page_image, page_coords)
polygon = polygon_for_parent(polygon, page)
points = points_from_polygon(polygon)
coords = CoordsType(points=points)
# if xywh['w'] < 30 or xywh['h'] < 30:
# LOG.info('Ignoring too small region: %s', points)
# it.Next(RIL.BLOCK)
def _process_region(self, it, region, rogroup, region_image, region_coords):
# Walk the iterator `it` at block level and build a TextRegionType
# sub-region of `region` for the current block, then branch on the
# block type reported by the iterator.
#   it            -- iterator queried via Empty/BoundingBox/BlockType at RIL.BLOCK
#   region        -- parent region; only its `id` is read in the lines visible here
#   rogroup       -- optional reading-order group whose existing members
#                    determine the first free index
#   region_image, region_coords -- passed to coordinates_for_segment to
#                    convert each block's bounding box
# NOTE(review): this excerpt stops on the FLOWING_TEXT check, whose body
# (and the statement that advances `it`) is not visible here.
# equivalent to GetComponentImages with raw_image=True,
# (which would also give raw coordinates),
# except we are also interested in the iterator's BlockType() here,
index = 0
if rogroup:
# continue numbering after the largest index already used in the group
for elem in (rogroup.get_RegionRefIndexed() +
rogroup.get_OrderedGroupIndexed() +
rogroup.get_UnorderedGroupIndexed()):
if elem.index >= index:
index = elem.index + 1
while it and not it.Empty(RIL.BLOCK):
# bounding box -> polygon -> converted coordinates -> points -> Coords
bbox = it.BoundingBox(RIL.BLOCK)
polygon = polygon_from_x0y0x1y1(bbox)
polygon = coordinates_for_segment(polygon, region_image, region_coords)
points = points_from_polygon(polygon)
coords = CoordsType(points=points)
# if xywh['w'] < 30 or xywh['h'] < 30:
# LOG.info('Ignoring too small region: %s', points)
# it.Next(RIL.BLOCK)
# continue
#
# add the region reference in the reading order element
# (but ignore non-text regions entirely)
# sub-region id is the parent id plus a zero-padded four-digit index
ID = region.id + "_%04d" % index
subregion = TextRegionType(id=ID, Coords=coords,
type=TextTypeSimpleType.PARAGRAPH)
block_type = it.BlockType()
if block_type == PT.FLOWING_TEXT:
def process(self):
    """
    Performs the region segmentation.

    Each input file is loaded as a page, its image is given to Tesseract,
    and every block-level component becomes a reading-order reference plus
    a text region on that page. One output file with the serialized page
    is registered per input file.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            local_file = self.workspace.download_file(input_file)
            page = OcrdPage.from_file(local_file)
            image = self.workspace.resolve_image_as_pil(page.imageFileName)
            log.debug("Detecting regions with tesseract")
            tessapi.SetImage(image)
            components = tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True)
            for component in components:
                box = component[1]
                index = component[2]
                # the region id doubles as the reading-order reference
                region_id = "r%i" % index
                page.add_reading_order_ref(region_id, index)
                page.add_textregion(region_id, box)
            self.add_output_file(
                ID=mets_file_id(self.outputGrp, n),
                input_file=input_file,
                mimetype=MIMETYPE_PAGE,
                content=page.to_xml(),
            )