Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def orientation_stuff():
    """Detect and print the orientation and script of a sample image.

    Opens a Tesseract API in OSD-only page segmentation mode, loads a
    hard-coded test image and prints the values reported by
    ``DetectOrientationScript()``.

    NOTE: the original author reported that both ``DetectOS()`` and
    ``DetectOrientationScript()`` crashed with a segmentation fault
    (core dumped) in their environment.
    """
    # The context manager releases the native Tesseract handle even when
    # detection raises.
    with PyTessBaseAPI(psm=PSM.OSD_ONLY, path=MY_TESSDATA_PATH) as api2:
        api2.SetImageFile('/home/johannes/Repos/tesseract/testing/eurotext.tif')
        detection = api2.DetectOrientationScript()

    if detection is None:
        # Nothing to format when detection produced no result.
        print("Orientation and script detection failed")
        return

    # DetectOrientationScript() reports its result under the keys
    # orient_deg / orient_conf / script_name / script_conf; the key names
    # of DetectOS() (orientation, oconfidence, ...) would raise KeyError here.
    print("Orientation: {orient_deg}\nOrientation confidence: {orient_conf}\n Script: {script_name}\nScript confidence: {script_conf}".format(**detection))
def setUp(self):
    """Prepare the per-test fixture.

    When ``pil_installed`` is truthy, the file at ``self._image_file`` is
    opened and fully loaded into ``self._image`` before the file handle is
    closed. An initialised ``PyTessBaseAPI`` is always stored in
    ``self._api``.
    """
    if pil_installed:
        with open(self._image_file, 'rb') as stream:
            # Force the pixel data to be read while the stream is still open.
            picture = self._image = Image.open(stream)
            picture.load()
    self._api = tesserocr.PyTessBaseAPI(init=True)
def tesseract_image(image, languages):
    """Run OCR on ``image`` and return the recognised text.

    The engine is kept on the module-level ``thread`` object: it is created
    on first use and re-initialised through ``Init`` on every later call.
    A ``RuntimeError`` raised during recognition is logged as a warning and
    ``None`` is returned instead. The engine is cleared after every call.
    """
    lang_spec = get_languages(languages)
    if hasattr(thread, 'api'):
        thread.api.Init(path=TESSDATA_PREFIX, lang=lang_spec)
    else:
        thread.api = PyTessBaseAPI(lang=lang_spec, path=TESSDATA_PREFIX)

    engine = thread.api
    text = None
    try:
        # TODO: play with contrast and sharpening the images.
        engine.SetPageSegMode(PSM.AUTO_OSD)
        engine.SetImage(image)
        text = engine.GetUTF8Text()
    except RuntimeError as error:
        log.warning(error)
    finally:
        # Release the image and recognition results regardless of outcome.
        engine.Clear()
    return text
def configure_engine(self, languages):
    """Return the engine cached on ``self.tl``, set up for ``languages``.

    An engine is created (LSTM-only, ``PSM.AUTO_OSD``) when ``self.tl`` has
    no ``api`` attribute or it is ``None``. If the engine's current
    initialisation languages differ from ``languages`` it is re-initialised
    with the requested ones.
    """
    from tesserocr import PyTessBaseAPI, PSM, OEM

    if getattr(self.tl, 'api', None) is None:
        log.info("Configuring OCR engine (%s)", languages)
        self.tl.api = PyTessBaseAPI(
            lang=languages,
            oem=OEM.LSTM_ONLY,
            psm=PSM.AUTO_OSD,
        )

    engine = self.tl.api
    active_languages = engine.GetInitLanguagesAsString()
    if languages != active_languages:
        log.info("Re-initialising OCR engine (%s)", languages)
        engine.Init(lang=languages, oem=OEM.LSTM_ONLY)
    return engine
def genLines(image=None):
    """Run Tesseract OCR text line by text line over one or more images.

    Args:
        image: A list of items accepted by ``Image.open``. Any other value
            is handed to ``PromptList`` and its result is iterated instead
            (presumably a user-selected list of images; confirm against
            ``PromptList``).

    Returns:
        A list with one entry per detected text line, across all images.
        Each entry is the list of stripped tokens obtained by splitting the
        line's recognised text on single spaces.
    """
    PrintGood('This is going to return OCR on either a list of images or full images')
    if not isinstance(image, list):
        image = PromptList('Which image/images to OCR: ', image)

    Found = []
    for source in image:
        # Both the image file and the Tesseract handle are released per image.
        with Image.open(source) as page, PyTessBaseAPI() as api:
            api.SetImage(page)
            boxes = api.GetComponentImages(RIL.TEXTLINE, True)
            # print() with a single argument behaves the same on Python 2
            # and Python 3; the statement form is a SyntaxError on Python 3.
            print('Found {} textline image components.'.format(len(boxes)))
            for i, (_, box, _, _) in enumerate(boxes):
                # box is a dict with x, y, w and h keys
                api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
                words = [word.strip() for word in api.GetUTF8Text().split(' ')]
                conf = api.MeanTextConf()
                Found.append(words)
                # .format() must be applied to the string, not to the
                # return value of print().
                print(u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, "
                      "confidence: {1}, text: {2}".format(i, conf, words, **box))
    return Found
def get_api(self, languages):
    """Return the Tesseract API cached on ``self.thread``.

    The API is built lazily on the first call, using ``languages`` and the
    ``PSM.AUTO_OSD`` page segmentation mode. Once an ``api`` attribute
    exists it is returned as-is, so ``languages`` is ignored on later calls.
    """
    if hasattr(self.thread, 'api'):
        return self.thread.api

    # Imported lazily so tesserocr is only needed once an engine is built.
    from tesserocr import PyTessBaseAPI, PSM

    engine = PyTessBaseAPI(lang=languages)
    engine.SetPageSegMode(PSM.AUTO_OSD)
    self.thread.api = engine
    return self.thread.api
def run_ocr_engine_for_single_image(dst, borders):
    """Run OCR (``lang='tur'``) over rectangular regions of one image.

    ``dst`` is written to a per-process temporary PNG with ``cv2.imwrite``,
    re-saved through PIL with 300 DPI metadata, and then recognised one
    rectangle at a time.

    Args:
        dst: Image data accepted by ``cv2.imwrite``.
        borders: Iterable of indexable items; elements 0-3 of each item are
            passed to ``SetRectangle`` as ``x, y, w, h``.

    NOTE(review): the cleaned-up text of each region is computed but not
    used in the visible code. This block may be truncated; confirm against
    the complete function.
    """
    filename = "./images/tmp_{}.png".format(os.getpid())
    cv2.imwrite(filename, dst)
    # Round-trip through PIL to stamp 300 DPI metadata on the file.
    image = Image.open(filename)
    image.save(filename, dpi=(300, 300))
    image = Image.open(filename)
    with PyTessBaseAPI(path="./tessdata", lang='tur') as api:
        api.SetImage(image)
        # Call form of print works on both Python 2 and Python 3; the
        # statement form is a SyntaxError on Python 3.
        print("-----------------------")
        for border in borders:
            x, y, w, h = border[0], border[1], border[2], border[3]
            api.SetRectangle(x, y, w, h)
            result_item = api.GetUTF8Text()
            # Collapse runs of blank lines in the recognised text.
            result_item = result_item.replace("\n\n\n", "\n")
            result_item = result_item.replace("\n\n", "\n")
Set up Tesseract to detect blocks, and add each one to the page
as a region according to BlockType at the detected coordinates.
If ``find_tables`` is True, try to detect table blocks and add them
as (atomic) TableRegion.
If ``crop_polygons`` is True, then query polygon outlines instead of
bounding boxes from Tesseract for each region. (This is more precise,
but due to some path representation errors does not always yield
accurate/valid polygons.)
Produce a new output file by serialising the resulting hierarchy.
"""
overwrite_regions = self.parameter['overwrite_regions']
find_tables = self.parameter['find_tables']
with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
if find_tables:
tessapi.SetVariable("textord_tabfind_find_tables", "1") # (default)
# this should yield additional blocks within the table blocks
# from the page iterator, but does not in fact (yet?):
# (and it can run into assertion errors when the table structure
# does not meet certain homogeneity expectations)
#tessapi.SetVariable("textord_tablefind_recognize_tables", "1")
else:
# disable table detection here, so tables will be
# analysed as independent text/line blocks:
tessapi.SetVariable("textord_tabfind_find_tables", "0")
for (n, input_file) in enumerate(self.input_files):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()