Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id)
LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
regions = page.get_TextRegion() + page.get_TableRegion()
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
for region in regions:
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh)
if oplevel == 'region':
tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh,
"region '%s'" % region.id, input_file.pageId,
file_id + '_' + region.id)
elif isinstance(region, TextRegionType):
lines = region.get_TextLine()
if not lines:
LOG.warning("Page '%s' region '%s' contains no text lines",
page_id, region.id)
for line in lines:
line_image, line_xywh = self.workspace.image_from_segment(
line, region_image, region_xywh)
tessapi.SetPageSegMode(PSM.SINGLE_LINE)
self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh,
"line '%s'" % line.id, input_file.pageId,
file_id + '_' + region.id + '_' + line.id)
def _process_regions(self, tessapi, regions, page_image, page_xywh):
    """Recognize each region as a whole, or hand its text lines on.

    Every region is first cropped out of ``page_image`` via
    ``self.workspace.image_from_segment``. What happens next depends on
    the ``textequiv_level`` parameter:

    * ``'region'``: the crop (optionally framed by ``padding`` pixels
      filled with the crop's per-band median) is given to ``tessapi``,
      recognized in ``PSM.SINGLE_BLOCK`` mode, and the result replaces
      any TextEquiv the region already carried.
    * anything else: the region's TextLine elements are passed to
      ``self._process_lines``; a region without lines is only logged.

    Args:
        tessapi: Tesseract API handle used for recognition.
        regions: iterable of region elements to process.
        page_image: image of the enclosing page.
        page_xywh: coordinate information belonging to ``page_image``.
    """
    for segment in regions:
        segment_image, segment_xywh = self.workspace.image_from_segment(
            segment, page_image, page_xywh)

        if self.parameter['textequiv_level'] != 'region':
            # Line, word, or glyph level: descend into the text lines.
            lines = segment.get_TextLine()
            if lines:
                self._process_lines(tessapi, lines, segment_image, segment_xywh)
            else:
                LOG.warning("Region '%s' contains no text lines", segment.id)
            continue

        # Region level: recognize the whole region in one go.
        margin = self.parameter['padding']
        if margin:
            # Frame the crop with its median colour so the added border
            # blends in with the region's own background.
            background = tuple(ImageStat.Stat(segment_image).median)
            canvas_size = (segment_image.width + 2 * margin,
                           segment_image.height + 2 * margin)
            ocr_input = Image.new(segment_image.mode, canvas_size, background)
            ocr_input.paste(segment_image, (margin, margin))
        else:
            ocr_input = segment_image
        tessapi.SetImage(ocr_input)
        tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
        # TODO: check region.get_primaryScript() against
        # tessapi.GetLoadedLanguages()
        LOG.debug("Recognizing text in region '%s'", segment.id)
        text = tessapi.GetUTF8Text().rstrip("\n\f")
        # Mean confidence divided by 100 (iterator scores are arithmetic
        # averages, too).
        confidence = tessapi.MeanTextConf() / 100.0
        if segment.get_TextEquiv():
            LOG.warning("Region '%s' already contained text results", segment.id)
            segment.set_TextEquiv([])
        # TODO: consider SetParagraphSeparator
        segment.add_TextEquiv(TextEquivType(Unicode=text, conf=confidence))
"""Performs (text) line segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the (text) region level,
and remove any existing TextLine elements (unless ``overwrite_lines``
is False).
Set up Tesseract to detect lines, and add each one to the region
at the detected coordinates.
Produce a new output file by serialising the resulting hierarchy.
"""
overwrite_lines = self.parameter['overwrite_lines']
with PyTessBaseAPI(
psm=PSM.SINGLE_BLOCK,
path=TESSDATA_PREFIX
) as tessapi:
for (n, input_file) in enumerate(self.input_files):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()
# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",