Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def get_rectangle(image):
    """Print the recognized text of every text line detected in ``image``.

    The image is loaded into the ``api`` object that this function picks up
    from its enclosing scope. The number of detected text-line components
    is printed first, followed by the OCR text of each line in turn.
    """
    api.SetImage(image)
    line_components = api.GetComponentImages(RIL.TEXTLINE, True)
    print('Found {} textline image components.'.format(len(line_components)))
    for _line_image, bounds, _, _ in line_components:
        # ``bounds`` is a dict with x, y, w and h keys; recognition is
        # limited to that rectangle before the text is read back.
        api.SetRectangle(bounds['x'], bounds['y'], bounds['w'], bounds['h'])
        line_text = api.GetUTF8Text()
        # The mean confidence is still queried here, but nothing uses it.
        _confidence = api.MeanTextConf()
        print(line_text)
def get_boxes_with_info(image_path='/home/johannes/Repos/tesseract/testing/phototest.tif'):
    """Print position, confidence and text for each text line of an image.

    Args:
        image_path: File to open with ``Image.open``. Defaults to the sample
            image path that was previously hard-coded, so existing
            zero-argument calls behave as before.

    The image is loaded into the ``api`` object found in the enclosing
    scope. One line is printed with the number of detected text-line
    components, then one line per component with its index, bounding box,
    mean confidence and recognized text.
    """
    # Open the image in a ``with`` block so the underlying file is closed
    # once processing is finished instead of being left open.
    with Image.open(image_path) as image:
        api.SetImage(image)
        boxes = api.GetComponentImages(RIL.TEXTLINE, True)
        print('Found {} textline image components.'.format(len(boxes)))
        for i, (_, box, _, _) in enumerate(boxes):
            # box is a dict with x, y, w and h keys
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            ocrResult = api.GetUTF8Text()
            conf = api.MeanTextConf()
            print(u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, confidence: {1}, text: {2}".format(i, conf, ocrResult, **box))
def genLines(image=None):
    """Run OCR on one or more images and collect the words of every text line.

    Args:
        image: A list of items accepted by ``Image.open``. Any non-list
            value is passed to ``PromptList`` and replaced by its result
            (presumably a list of image paths -- confirm against that
            helper).

    Returns:
        A list with one entry per detected text line, across all images in
        order. Each entry is the list of tokens obtained by splitting the
        line's text on single spaces and stripping surrounding whitespace.
    """
    PrintGood('This is going to return OCR on either a list of images or full images')
    if not isinstance(image, list):
        image = PromptList('Which image/images to OCR: ', image)
    Found = []
    # A separate loop variable avoids rebinding the list being iterated.
    for image_source in image:
        # Both the image file and the Tesseract API are released when the
        # block exits, including on error.
        with Image.open(image_source) as page_image, PyTessBaseAPI() as api:
            api.SetImage(page_image)
            boxes = api.GetComponentImages(RIL.TEXTLINE, True)
            # print(...) with one parenthesized argument is valid on both
            # Python 2 and Python 3, unlike the print statement.
            print('Found {} textline image components.'.format(len(boxes)))
            for i, (_, box, _, _) in enumerate(boxes):
                # box is a dict with x, y, w and h keys
                api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
                ocrResult = api.GetUTF8Text().split(' ')
                conf = api.MeanTextConf()
                ocrResult = [word.strip() for word in ocrResult]
                Found.append(ocrResult)
                print(u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, "
                      "confidence: {1}, text: {2}".format(i, conf, ocrResult, **box))
    return Found
def Spaces(image=None):
# OCR the given image(s) text line by text line and tally the
# space-separated tokens of the recognized text.
# `image` is expected to be a list; any other value is passed to PromptList
# and replaced by its result (presumably a list of image paths -- confirm).
PrintGood('This returns the number of spaces in a specific image or images')
if isinstance(image, list) == False:
image = PromptList('Which image/images to Scan: ', image)
# NOTE(review): the loop variable reuses the name of the list it iterates.
for image in image:
image = Image.open(image)
with PyTessBaseAPI() as api:
api.SetImage(image)
boxes = api.GetComponentImages(RIL.TEXTLINE, True)
# NOTE(review): this local counter has the same name as the function itself.
Spaces = 0
for i, (im, box, _, _) in enumerate(boxes):
# Each text-line crop is written to 'saving<i>.jpg' (a relative path).
im.save('saving{}.jpg'.format(i))
# Restrict recognition to this line's bounding box before reading the text.
api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
ocrResult = api.GetUTF8Text()
# NOTE(review): conf is assigned but not used anywhere in this function.
conf = api.MeanTextConf()
# Newlines are removed, then the text is split on single spaces; the value
# added to the tally is the number of resulting pieces, not the number of
# space characters.
text = str(ocrResult).replace('\n', '').split(' ')
Spaces = len(text) + Spaces
return int(Spaces)
def ocr_extract(api, image, strings):
    """Yield entries of ``strings`` that closely match text found in ``image``.

    Args:
        api: Tesseract API object providing ``SetImage``,
            ``GetComponentImages``, ``SetRectangle``, ``GetUTF8Text`` and
            ``Clear``.
        image: Image passed to ``api.SetImage``.
        strings: Candidate strings to compare the recognized text against.

    Yields:
        Each candidate returned by ``difflib.get_close_matches`` (cutoff
        0.9) for the full text of a line, for its ``|``-separated pieces
        and for its whitespace-separated pieces. The same candidate can be
        yielded more than once.

    ``api.Clear()`` is called when iteration ends, including when the
    generator is closed before exhaustion or an error is raised.
    """
    api.SetImage(image)
    try:
        for item in api.GetComponentImages(RIL.TEXTLINE, True):
            box = item[1]
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            ocr_result = api.GetUTF8Text()
            parts = [ocr_result] + ocr_result.split('|') + ocr_result.split()
            for part in parts:
                for match in difflib.get_close_matches(part, strings, cutoff=0.9):
                    yield match
    finally:
        # Runs on normal exhaustion, on generator.close() and on errors,
        # so the API is always cleared.
        api.Clear()
italic=word_attributes['italic']
if 'italic' in word_attributes else None,
underlined=word_attributes['underlined']
if 'underlined' in word_attributes else None,
monospace=word_attributes['monospace']
if 'monospace' in word_attributes else None,
serif=word_attributes['serif']
if 'serif' in word_attributes else None)
word.set_TextStyle(word_style) # (or somewhere in custom attribute?)
# add word annotation unconditionally (i.e. even for glyph level):
word.add_TextEquiv(TextEquivType(
Unicode=result_it.GetUTF8Text(RIL.WORD),
conf=result_it.Confidence(RIL.WORD)/100))
if self.parameter['textequiv_level'] != 'word':
self._process_glyphs_in_word(result_it, word, line_xywh)
if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
break
else:
word_no += 1
result_it.Next(RIL.WORD)
with tesserocr.PyTessBaseAPI() as api:
for ID in self.handle.img_files:
image = cv2.imread(self.handle.img_files[ID])
if ID in self.handle.page_trees:
PcGts = self.handle.page_trees[ID].getroot()
pages = PcGts.xpath("page:Page", namespaces=ns)
for page in pages[0:1]:
regions = page.xpath("TextRegion")
for region in regions:
points = region.xpath("Coords")[0].get("points")
polygon = self._polygon_from_points(points)
poly = numpy.array(polygon,numpy.int32)
region_cut = image[numpy.min(poly[:,1]):numpy.max(poly[:,1]),numpy.min(poly[:,0]):numpy.max(poly[:,0])]
region_img = Image.fromarray(region_cut)
api.SetImage(region_img)
lines = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
for i, (im, box, index, _) in enumerate(lines):
line = ET.SubElement(region, "TextLine")
coords = ET.SubElement(line, "Coords")
coords.set("points",self._points_from_box(box))
for region in itertools.chain.from_iterable(
[page.get_TextRegion()] +
[subregion.get_TextRegion() for subregion in page.get_TableRegion()]):
if region.get_TextLine():
if overwrite_lines:
LOG.info('removing existing TextLines in region "%s"', region.id)
region.set_TextLine([])
else:
LOG.warning('keeping existing TextLines in region "%s"', region.id)
LOG.debug("Detecting lines in region '%s'", region.id)
region_image, region_coords = self.workspace.image_from_segment(
region, page_image, page_coords)
region_polygon = coordinates_of_segment(region, region_image, region_coords)
region_poly = Polygon(region_polygon)
tessapi.SetImage(region_image)
for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
line_id = '%s_line%04d' % (region.id, line_no)
line_polygon = polygon_from_xywh(component[1])
line_poly = Polygon(line_polygon)
if not line_poly.within(region_poly):
# this could happen due to rotation
interline = line_poly.intersection(region_poly)
if interline.is_empty:
continue # ignore this line
if hasattr(interline, 'geoms'):
# is (heterogeneous) GeometryCollection
area = 0
for geom in interline.geoms:
if geom.area > area:
area = geom.area
interline = geom
if not area:
def process(self):
    """
    Detect text lines in every text region of every input page.

    Each input file is downloaded through the workspace and parsed into a
    page. For every text region the region image is resolved, handed to
    Tesseract, and the second element of each returned text-line component
    is added to the region via ``add_textline``. The resulting page is then
    registered as an output file with mimetype ``MIMETYPE_PAGE`` and the
    page's ``to_xml()`` output as content.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for file_index, input_file in enumerate(self.input_files):
            local_file = self.workspace.download_file(input_file)
            page = OcrdPage.from_file(local_file)
            image_url = page.imageFileName
            for region in page.list_textregions():
                log.debug("Detecting lines in %s with tesseract", region)
                region_image = self.workspace.resolve_image_as_pil(
                    image_url, region.coords)
                tessapi.SetImage(region_image)
                line_components = tessapi.GetComponentImages(
                    tesserocr.RIL.TEXTLINE, True)
                for line_component in line_components:
                    line_box = line_component[1]
                    region.add_textline(coords=line_box)
            # One output file per input file, written after all of its
            # regions have been segmented.
            self.add_output_file(
                ID=mets_file_id(self.outputGrp, file_index),
                input_file=input_file,
                mimetype=MIMETYPE_PAGE,
                content=page.to_xml(),
            )