How to use the tesserocr.RIL.BLOCK constant in tesserocr

To help you get started, we’ve selected a few tesserocr examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github sirfz / tesserocr / tests / test_api.py View on Github external
def test_layout_getcomponents(self):
        """Block-level component extraction yields a non-empty result whose first box is large."""
        api = self._api
        api.Init()
        api.SetImageFile(self._image_file)
        components = api.GetComponentImages(tesserocr.RIL.BLOCK, True)
        # There has to be at least one block.
        self.assertTrue(components)
        # Every entry unpacks into four items; only the box dict (second item)
        # is inspected here.
        _image, box, _block_id, _paragraph_id = components[0]
        for key in ('w', 'h'):
            self.assertIn(key, box)
        # The first block is expected to cover a sizeable area.
        self.assertGreater(box['w'] * box['h'], 400000)
github sirfz / tesserocr / tests / test_api.py View on Github external
def test_layout_boundingbox(self):
        """Layout analysis yields a non-empty iterator whose block-level bounding box is large."""
        api = self._api
        api.Init()
        api.SetImageFile(self._image_file)
        page_it = api.AnalyseLayout()
        # The analysis must produce an iterator that has a block to offer.
        self.assertTrue(page_it)
        self.assertFalse(page_it.Empty(tesserocr.RIL.BLOCK))
        bbox = page_it.BoundingBox(tesserocr.RIL.BLOCK)
        self.assertIsNot(bbox, None)
        # The box is a 4-tuple of two corner points.
        left, top, right, bottom = bbox
        width = right - left
        height = bottom - top
        # The block is expected to cover a sizeable area.
        self.assertGreater(width * height, 400000)
github OCR-D / core / ocrd / segment / segmenting.py View on Github external
def segment(self):
        """
        Performs the segmentation.

        Every entry of ``self.handle.img_files`` is opened as an image and
        handed to a ``tesserocr.PyTessBaseAPI`` instance, which is asked for
        its block-level components.  For each component a ``RegionRefIndexed``
        element referring to ``r<index>`` is appended to a ``ReadingOrder``
        element of the page tree registered for the same ID.

        NOTE(review): only the beginning of this method is visible in this
        excerpt; anything after the last statement shown is not described.
        """
        # one API instance is shared by all images; the with-block releases it
        with tesserocr.PyTessBaseAPI() as api:
            for ID in self.handle.img_files:
                image = Image.open(self.handle.img_files[ID])
                api.SetImage(image)
                # each returned entry is unpacked as a 4-tuple further down
                boxes = api.GetComponentImages(tesserocr.RIL.BLOCK, True)
                if len(boxes) > 0:
                    # get XML for ID
                    if ID in self.handle.page_trees:
                        PcGts = self.handle.page_trees[ID].getroot()
                        pages = PcGts.xpath("page:Page", namespaces=ns)
                        # reuse the first existing Page element, otherwise add one
                        if len(pages) > 0:
                            page = pages[0]
                        else:
                            # NOTE(review): the lookup above uses the "page:" prefix,
                            # this element is created with a plain "Page" tag --
                            # confirm the namespace handling is intended
                            page = ET.SubElement(PcGts,"Page")
                        # a new ReadingOrder element is appended on every pass
                        reading_order = ET.SubElement(page,"ReadingOrder")
                    # NOTE(review): reading_order is assigned only inside the branch
                    # above; when ID is missing from page_trees the loop below sees
                    # an unbound name or the value left over from an earlier image
                    # -- confirm whether that case can occur
                    for i, (im, box, index, _) in enumerate(boxes):

                        # the region reference in the reading order element
                        region_ref = "r%i" % index
                        region_ref_indexed = ET.SubElement(reading_order,"RegionRefIndexed")
                        region_ref_indexed.set("regionRef", region_ref)
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / segment_region.py View on Github external
if not ro:
            ro = ReadingOrderType()
            page.set_ReadingOrder(ro)
        og = ro.get_OrderedGroup()
        if og:
            # start counting from largest existing index
            for elem in (og.get_RegionRefIndexed() +
                         og.get_OrderedGroupIndexed() +
                         og.get_UnorderedGroupIndexed()):
                if elem.index >= index:
                    index = elem.index + 1
        else:
            # new top-level group
            og = OrderedGroupType(id="reading-order")
            ro.set_OrderedGroup(og)
        while it and not it.Empty(RIL.BLOCK):
            # (padding will be passed to both BoundingBox and GetImage)
            # (actually, Tesseract honours padding only on the left and bottom,
            #  whereas right and top are increased less!)
            bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
            # sometimes these polygons are not planar, which causes
            # PIL.ImageDraw.Draw.polygon (and likely others as well)
            # to misbehave; however, PAGE coordinate semantics prohibit
            # multi-path polygons!
            # (probably a bug in Tesseract itself, cf. tesseract#2826):
            if self.parameter['crop_polygons']:
                polygon = it.BlockPolygon()
            else:
                polygon = polygon_from_x0y0x1y1(bbox)
            polygon = coordinates_for_segment(polygon, page_image, page_coords)
            polygon = polygon_for_parent(polygon, page)
            points = points_from_polygon(polygon)
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / binarize.py View on Github external
value=self.parameter[name])
                                                               for name in self.parameter.keys()])]))
                page = pcgts.get_Page()
                page_image, page_xywh, _ = self.workspace.image_from_page(
                    page, page_id)
                LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
                
                regions = page.get_TextRegion() + page.get_TableRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    if oplevel == 'region':
                        tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
                        self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh,
                                              "region '%s'" % region.id, input_file.pageId,
                                              file_id + '_' + region.id)
                    elif isinstance(region, TextRegionType):
                        lines = region.get_TextLine()
                        if not lines:
                            LOG.warning("Page '%s' region '%s' contains no text lines",
                                        page_id, region.id)
                        for line in lines:
                            line_image, line_xywh = self.workspace.image_from_segment(
                                line, region_image, region_xywh)
                            tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                            self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh,
                                                  "line '%s'" % line.id, input_file.pageId,
                                                  file_id + '_' + region.id + '_' + line.id)

                # Use input_file's basename for the new file -
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / crop.py View on Github external
LOG.debug("Cropping with tesseract")
                tessapi.SetImage(page_image)
                # PSM.SPARSE_TEXT: get as much text as possible in no particular order
                # PSM.AUTO (default): includes tables (dangerous)
                tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
                #
                # helper variables for saving the box coordinates
                #
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                # iterate over all text blocks and compare their
                # bbox extent to the running min and max values
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    image, xywh, index, _ = component
                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    left, top, right, bottom = bbox_from_xywh(xywh)
                    LOG.debug("Detected text region '%s': %i:%i,%i:%i",
                              ID, left, right, top, bottom)
                    # filter region results:
                    bin_bbox = image.getbbox()
                    if not bin_bbox:
                        # this does happen!
                        LOG.info("Ignoring region '%s' because its binarization is empty", ID)
                        continue
                    width = bin_bbox[2]-bin_bbox[0]
                    if width < 25 / zoom:
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / segment_table.py View on Github external
def _process_region(self, it, region, rogroup, region_image, region_coords):
        """Build a text subregion for each block the layout iterator offers.

        Args:
            it: layout iterator; queried at ``RIL.BLOCK`` level through
                ``Empty``, ``BoundingBox`` and ``BlockType``.
            region: parent region; its ``id`` prefixes the IDs of the
                subregions created here.
            rogroup: reading-order group (may be falsy); when present, its
                indexed children determine the first free index.
            region_image: forwarded to ``coordinates_for_segment``.
            region_coords: forwarded to ``coordinates_for_segment``.

        NOTE(review): only the beginning of this method is visible in this
        excerpt (the step that advances the iterator is not shown); anything
        after the last statement shown is not described.
        """
        # equivalent to GetComponentImages with raw_image=True,
        # (which would also give raw coordinates),
        # except we are also interested in the iterator's BlockType() here,
        index = 0
        if rogroup:
            # start counting one past the largest index already in the group
            for elem in (rogroup.get_RegionRefIndexed() +
                         rogroup.get_OrderedGroupIndexed() +
                         rogroup.get_UnorderedGroupIndexed()):
                if elem.index >= index:
                    index = elem.index + 1
        # `it` may be falsy, in which case no block is processed at all
        while it and not it.Empty(RIL.BLOCK):
            # bounding box -> polygon -> converted via the region image and
            # its coordinates -> points string for the Coords element
            bbox = it.BoundingBox(RIL.BLOCK)
            polygon = polygon_from_x0y0x1y1(bbox)
            polygon = coordinates_for_segment(polygon, region_image, region_coords)
            points = points_from_polygon(polygon)
            coords = CoordsType(points=points)
            # if xywh['w'] < 30 or xywh['h'] < 30:
            #     LOG.info('Ignoring too small region: %s', points)
            #     it.Next(RIL.BLOCK)
            #     continue
            #
            # add the region reference in the reading order element
            # (but ignore non-text regions entirely)
            # subregion IDs are the parent ID plus a zero-padded 4-digit index
            ID = region.id + "_%04d" % index
            subregion = TextRegionType(id=ID, Coords=coords,
                                       type=TextTypeSimpleType.PARAGRAPH)
            block_type = it.BlockType()
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / segment_region.py View on Github external
if og:
            # start counting from largest existing index
            for elem in (og.get_RegionRefIndexed() +
                         og.get_OrderedGroupIndexed() +
                         og.get_UnorderedGroupIndexed()):
                if elem.index >= index:
                    index = elem.index + 1
        else:
            # new top-level group
            og = OrderedGroupType(id="reading-order")
            ro.set_OrderedGroup(og)
        while it and not it.Empty(RIL.BLOCK):
            # (padding will be passed to both BoundingBox and GetImage)
            # (actually, Tesseract honours padding only on the left and bottom,
            #  whereas right and top are increased less!)
            bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
            # sometimes these polygons are not planar, which causes
            # PIL.ImageDraw.Draw.polygon (and likely others as well)
            # to misbehave; however, PAGE coordinate semantics prohibit
            # multi-path polygons!
            # (probably a bug in Tesseract itself, cf. tesseract#2826):
            if self.parameter['crop_polygons']:
                polygon = it.BlockPolygon()
            else:
                polygon = polygon_from_x0y0x1y1(bbox)
            polygon = coordinates_for_segment(polygon, page_image, page_coords)
            polygon = polygon_for_parent(polygon, page)
            points = points_from_polygon(polygon)
            coords = CoordsType(points=points)
            # if xywh['w'] < 30 or xywh['h'] < 30:
            #     LOG.info('Ignoring too small region: %s', points)
            #     it.Next(RIL.BLOCK)
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / segment_table.py View on Github external
def _process_region(self, it, region, rogroup, region_image, region_coords):
        """Build a text subregion for each block the layout iterator offers.

        Args:
            it: layout iterator; queried at ``RIL.BLOCK`` level through
                ``Empty``, ``BoundingBox`` and ``BlockType``.
            region: parent region; its ``id`` prefixes the IDs of the
                subregions created here.
            rogroup: reading-order group (may be falsy); when present, its
                indexed children determine the first free index.
            region_image: forwarded to ``coordinates_for_segment``.
            region_coords: forwarded to ``coordinates_for_segment``.

        NOTE(review): only the beginning of this method is visible in this
        excerpt (it stops at the first block-type check); anything after the
        last statement shown is not described.
        """
        # equivalent to GetComponentImages with raw_image=True,
        # (which would also give raw coordinates),
        # except we are also interested in the iterator's BlockType() here,
        index = 0
        if rogroup:
            # start counting one past the largest index already in the group
            for elem in (rogroup.get_RegionRefIndexed() +
                         rogroup.get_OrderedGroupIndexed() +
                         rogroup.get_UnorderedGroupIndexed()):
                if elem.index >= index:
                    index = elem.index + 1
        # `it` may be falsy, in which case no block is processed at all
        while it and not it.Empty(RIL.BLOCK):
            # bounding box -> polygon -> converted via the region image and
            # its coordinates -> points string for the Coords element
            bbox = it.BoundingBox(RIL.BLOCK)
            polygon = polygon_from_x0y0x1y1(bbox)
            polygon = coordinates_for_segment(polygon, region_image, region_coords)
            points = points_from_polygon(polygon)
            coords = CoordsType(points=points)
            # if xywh['w'] < 30 or xywh['h'] < 30:
            #     LOG.info('Ignoring too small region: %s', points)
            #     it.Next(RIL.BLOCK)
            #     continue
            #
            # add the region reference in the reading order element
            # (but ignore non-text regions entirely)
            # subregion IDs are the parent ID plus a zero-padded 4-digit index
            ID = region.id + "_%04d" % index
            subregion = TextRegionType(id=ID, Coords=coords,
                                       type=TextTypeSimpleType.PARAGRAPH)
            block_type = it.BlockType()
            # the handling of flowing-text blocks continues beyond this excerpt
            if block_type == PT.FLOWING_TEXT:
github OCR-D / core / ocrd / processor / segment_region / tesserocr.py View on Github external
def process(self):
        """
        Performs the region segmentation.

        Each input file is downloaded and parsed into a page, its image is
        handed to Tesseract, and every block-level component found is
        recorded on the page as a reading-order reference plus a text region.
        The page is then serialized and registered as an output file.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            for file_number, input_file in enumerate(self.input_files):
                local_file = self.workspace.download_file(input_file)
                page = OcrdPage.from_file(local_file)
                page_image = self.workspace.resolve_image_as_pil(page.imageFileName)
                log.debug("Detecting regions with tesseract")
                tessapi.SetImage(page_image)
                blocks = tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True)
                for block in blocks:
                    # second item: the box, third item: the block index
                    box = block[1]
                    index = block[2]
                    # the same identifier names the reading-order entry and the region
                    region_id = "r%i" % index
                    page.add_reading_order_ref(region_id, index)
                    page.add_textregion(region_id, box)
                output_id = mets_file_id(self.outputGrp, file_number)
                self.add_output_file(ID=output_id,
                                     input_file=input_file,
                                     mimetype=MIMETYPE_PAGE,
                                     content=page.to_xml())