Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
# NOTE(review): fragment of an OCR-D Tesseract recognize processor — the
# enclosing method and loop start before this excerpt, so the next line is
# the continuation of a call begun out of view.
line, region_image, region_xywh)
# todo: Tesseract works better if the line images have a 5px margin everywhere
# Optionally pad the line image on all four sides with its median colour
# (ImageStat median of the crop), since the parameterized padding is meant
# to give Tesseract a margin around the glyphs.
if self.parameter['padding']:
    bg = tuple(ImageStat.Stat(line_image).median)
    pad = self.parameter['padding']
    padded = Image.new(line_image.mode,
                       (line_image.width + 2 * pad,
                        line_image.height + 2 * pad), bg)
    padded.paste(line_image, (pad, pad))
    tessapi.SetImage(padded)
else:
    tessapi.SetImage(line_image)
# Choose the page segmentation mode per parameter: RAW_LINE vs SINGLE_LINE.
if self.parameter['raw_lines']:
    tessapi.SetPageSegMode(PSM.RAW_LINE)
else:
    tessapi.SetPageSegMode(PSM.SINGLE_LINE)
#if line.get_primaryScript() not in tessapi.GetLoadedLanguages()...
LOG.debug("Recognizing text in line '%s'", line.id)
if self.parameter['textequiv_level'] == 'line':
    # Strip trailing newline/formfeed that Tesseract appends to line output.
    line_text = tessapi.GetUTF8Text().rstrip("\n\f")
    line_conf = tessapi.MeanTextConf()/100.0 # iterator scores are arithmetic averages, too
    # Replace (not append to) any pre-existing text results on this line.
    if line.get_TextEquiv():
        LOG.warning("Line '%s' already contained text results", line.id)
        line.set_TextEquiv([])
    # todo: consider BlankBeforeWord, SetLineSeparator
    line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf))
    continue # next line (to avoid indentation below)
## word, or glyph level:
words = line.get_Word()
if words:
    ## external word layout:
    LOG.warning("Line '%s' contains words already, recognition might be suboptimal", line.id)
def process(self):
    """
    Performs the (text) recognition.

    Iterates over all workspace input files, parses each as a PAGE
    document, and runs Tesseract (PSM.SINGLE_LINE) on the image crop of
    every text line in every text region, storing the raw UTF-8 result
    on the line's ``textequiv``.

    NOTE(review): this excerpt is truncated — the trailing
    ``self.add_output_file(`` call is cut off mid-statement.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
        log.info("Using model %s in %s for recognition", tesserocr.get_languages()[0], tesserocr.get_languages()[1][-1])
        tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            self.workspace.download_file(input_file)
            page = OcrdPage.from_file(input_file)
            image_url = page.imageFileName
            log.info("page %s", page)
            for region in page.list_textregions():
                textlines = region.list_textlines()
                log.info("About to recognize text in %i lines of region '%s'", len(textlines), region.ID)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing text in region '%s' line '%s'", region.ID, line_no)
                    # TODO use binarized / gray
                    image = self.workspace.resolve_image_as_pil(image_url, line.coords)
                    tessapi.SetImage(image)
                    line.textequiv = tessapi.GetUTF8Text()
        self.add_output_file(
# NOTE(review): fragment — the enclosing function's signature is out of
# view; ``image`` is presumably a binary (0/1) numpy array — confirm.
# If black pixels dominate, the image is likely white-on-black; invert so
# the text is dark on a light background as Tesseract expects — TODO confirm.
black_pixel_count = image[image == 0].size
white_pixel_count = image[image == 1].size
if black_pixel_count > white_pixel_count:
    image = skimage.util.invert(image)
# Two morphological closing passes (horizontal, then vertical rectangles)
# to bridge small gaps in the strokes before OCR.
image = skimage.morphology.closing(image, skimage.morphology.rectangle(1, horizontal_closing))
image = skimage.morphology.closing(image, skimage.morphology.rectangle(vertical_closing, 1))
image = skimage.util.img_as_ubyte(image)
# Platform-dependent OCR backend: tesserocr on Unix, pytesseract on Windows.
if is_unix():
    return tesserocr.image_to_text(
        Image.fromarray(image),
        psm=tesserocr.PSM.SINGLE_LINE,
        oem=tesserocr.OEM.TESSERACT_ONLY
    ).strip()
elif is_windows():
    return pytesseract.image_to_string(Image.fromarray(image))
def ocr_line(im, chars):
    """OCR a single line image, restricting the output alphabet.

    :param im: PIL image of one text line.
    :param chars: string of characters allowed in the result
        (``tessedit_char_whitelist``).
    :return: the recognized UTF-8 text (including Tesseract's trailing
        newline, as produced by ``GetUTF8Text``).
    """
    # Re-initialize on every call so no adaptive "learning" state leaks
    # between invocations; the whitelist must be set after InitFull,
    # which resets all variables.
    tessapi.InitFull()
    tessapi.SetVariable("tessedit_char_whitelist", chars)
    tessapi.SetImage(im)
    tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
    return tessapi.GetUTF8Text()
# -*- coding: utf-8 -*-
from __future__ import print_function, division
from PIL import Image
import math
from imgdata import *
from pokedata import *
from common import Rectangle, Point
import tesserocr
import sys
# Single module-level Tesseract API handle, configured for one-line OCR
# and shared by the helper functions below.
tessapi = tesserocr.PyTessBaseAPI(psm=tesserocr.PSM.SINGLE_LINE)
def ocr_line(im, chars):
    """Recognize the text of a single line image *im*.

    :param im: PIL image containing exactly one line of text.
    :param chars: whitelist of permitted output characters, applied via
        the ``tessedit_char_whitelist`` Tesseract variable.
    :return: UTF-8 text as returned by ``GetUTF8Text``.
    """
    # avoid Tesseract trying to "learn" anything: InitFull resets the
    # engine (and all variables, so the whitelist is set afterwards)
    tessapi.InitFull()
    tessapi.SetVariable("tessedit_char_whitelist", chars)
    tessapi.SetImage(im)
    tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
    return tessapi.GetUTF8Text()
def read_level(im, trainer_level):
    # NOTE(review): truncated — the body continues past this excerpt and
    # no return statement is in view.
    # The level meter appears to be an arc; derive its centre from the
    # MeterBounds bounding box (presumably screen coordinates — confirm).
    circle_r = MeterBounds.w / 2
    circle_center = Point(MeterBounds.x + circle_r, MeterBounds.y + circle_r)
    ballr = MeterBallRadius
    # half-levels start from 0
    hlvl_max = trainer_level * 2 + 1
    # Lowest CP multiplier from the first LevelStats entry.
    cpm_min = LevelStats[0]["cpmulti"]
# NOTE(review): fragment of an OCR-D processor — the enclosing loop over
# regions (and the definitions of oplevel/page_image/file_id/n) start
# before this excerpt, and the trailing ``add_file`` call is cut off.
region_image, region_xywh = self.workspace.image_from_segment(
    region, page_image, page_xywh)
# At region level, recognize the whole block at once; otherwise descend
# to the individual text lines of text regions.
if oplevel == 'region':
    tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
    self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh,
                          "region '%s'" % region.id, input_file.pageId,
                          file_id + '_' + region.id)
elif isinstance(region, TextRegionType):
    lines = region.get_TextLine()
    if not lines:
        LOG.warning("Page '%s' region '%s' contains no text lines",
                    page_id, region.id)
    for line in lines:
        line_image, line_xywh = self.workspace.image_from_segment(
            line, region_image, region_xywh)
        tessapi.SetPageSegMode(PSM.SINGLE_LINE)
        self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh,
                              "line '%s'" % line.id, input_file.pageId,
                              file_id + '_' + region.id + '_' + line.id)
# Use input_file's basename for the new file -
# this way the files retain the same basenames:
file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
if file_id == input_file.ID:
    # Fall back to a padded sequential ID when the group substitution
    # did not change anything.
    file_id = concat_padded(self.page_grp, n)
self.workspace.add_file(
    ID=file_id,
    file_grp=self.page_grp,
    pageId=input_file.pageId,
    mimetype=MIMETYPE_PAGE,
    local_filename=os.path.join(self.page_grp,
                                file_id + '.xml'),
"""Performs word segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the textline level,
and remove any existing Word elements (unless ``overwrite_words``
is False).
Set up Tesseract to detect words, and add each one to the line
at the detected coordinates.
Produce a new output file by serialising the resulting hierarchy.
"""
overwrite_words = self.parameter['overwrite_words']
with PyTessBaseAPI(
psm=PSM.SINGLE_LINE,
path=TESSDATA_PREFIX
) as tessapi:
for (n, input_file) in enumerate(self.input_files):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()
# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",