Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_result_iterator(self):
    """Check the text and leading blanks of the first two recognized words.

    Recognizes the test image, walks the result iterator at word level
    and compares the first two words against their expected values.
    Iteration stops once a third word is reached.
    """
    # (text, blanks before word) expected at word index 0 and 1.
    expected_words = (("The", 0), ("(quick)", 1))

    self._api.SetImageFile(self._image_file)
    self._api.Recognize()
    result_it = self._api.GetIterator()
    word_level = tesserocr.RIL.WORD

    words = tesserocr.iterate_level(result_it, word_level)
    for position, word in enumerate(words):
        word_text = word.GetUTF8Text(word_level)
        leading_blanks = word.BlanksBeforeWord()
        # Only the first two words are verified.
        if position >= len(expected_words):
            break
        expected_text, expected_blanks = expected_words[position]
        self.assertEqual(word_text, expected_text)
        self.assertEqual(leading_blanks, expected_blanks)
def _process_words_in_line(self, result_it, line, line_xywh):
# Walk `result_it` at word level and annotate each word on `line`:
# a WordType with coordinates is added to the line, followed by its
# recognized text and confidence.
#
# result_it -- result iterator; queried with RIL.WORD / RIL.SYMBOL below
# line      -- line object; its `id` prefixes the word ids and it
#              receives the new words through add_Word()
# line_xywh -- passed to coordinates_for_segment() when converting the
#              word bounding box
#
# NOTE(review): this excerpt has lost its indentation and appears to have
# lines elided in the middle (see the notes further down) -- confirm
# against the complete source before relying on it.
#
# Nothing to annotate: warn and leave the line untouched.
if not result_it or result_it.Empty(RIL.WORD):
LOG.warning("No text in line '%s'", line.id)
return
# iterate until IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
word_no = 0
while result_it and not result_it.Empty(RIL.WORD):
# Word ids are the line id plus a zero-padded running number.
word_id = '%s_word%04d' % (line.id, word_no)
LOG.debug("Decoding text in word '%s'", word_id)
bbox = result_it.BoundingBox(RIL.WORD)
# convert to absolute coordinates:
# (the 'padding' parameter is subtracted from the converted polygon;
# presumably this undoes padding added to the image -- TODO confirm)
polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
None, line_xywh) - self.parameter['padding']
points = points_from_polygon(polygon)
word = WordType(id=word_id, Coords=CoordsType(points))
line.add_Word(word)
# todo: determine if font attributes available for word level will work with LSTM models
word_attributes = result_it.WordFontAttributes()
# NOTE(review): from here down to result_it.Next(RIL.SYMBOL) the code
# reads word_xywh, glyph_id, glyph_conf and glyph_no, none of which is
# assigned anywhere in the visible code. This looks like the body of a
# glyph-level routine (compare the _process_glyphs_in_word call below)
# spliced into the excerpt -- confirm against the complete source.
polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
None, word_xywh) - self.parameter['padding']
points = points_from_polygon(polygon)
glyph = GlyphType(id=glyph_id, Coords=CoordsType(points))
word.add_Glyph(glyph)
# Record alternative readings of the glyph, in the order the choice
# iterator yields them.
choice_it = result_it.GetChoiceIterator()
for (choice_no, choice) in enumerate(choice_it):
alternative_text = choice.GetUTF8Text()
# Confidence is divided by 100 before being compared and stored.
alternative_conf = choice.Confidence()/100
#LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
# Stop once an alternative falls more than CHOICE_THRESHOLD_CONF below
# glyph_conf, or once the index exceeds CHOICE_THRESHOLD_NUM.
if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
choice_no > CHOICE_THRESHOLD_NUM):
break
# todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf))
# Stop after the last symbol of the word, otherwise advance by one symbol.
if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
break
else:
glyph_no += 1
result_it.Next(RIL.SYMBOL)
# NOTE(review): the keyword arguments below have no visible opening call,
# and `word_style` (used right after them) is never assigned in view --
# presumably a line of the form `word_style = <SomeType>(` was dropped
# from the excerpt; TODO confirm. Each argument takes its value from
# word_attributes when the key is present and is None otherwise.
fontFamily=word_attributes['font_name']
if 'font_name' in word_attributes else None,
bold=word_attributes['bold']
if 'bold' in word_attributes else None,
italic=word_attributes['italic']
if 'italic' in word_attributes else None,
underlined=word_attributes['underlined']
if 'underlined' in word_attributes else None,
monospace=word_attributes['monospace']
if 'monospace' in word_attributes else None,
serif=word_attributes['serif']
if 'serif' in word_attributes else None)
word.set_TextStyle(word_style) # (or somewhere in custom attribute?)
# add word annotation unconditionally (i.e. even for glyph level):
# (word confidence is divided by 100 before being stored)
word.add_TextEquiv(TextEquivType(
Unicode=result_it.GetUTF8Text(RIL.WORD),
conf=result_it.Confidence(RIL.WORD)/100))
# Any textequiv_level other than 'word' also descends into the glyphs.
if self.parameter['textequiv_level'] != 'word':
self._process_glyphs_in_word(result_it, word, line_xywh)
# Stop after the last word of the text line, otherwise advance by one word.
if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
break
else:
word_no += 1
result_it.Next(RIL.WORD)
# Detect words in every text line of every text region of the page and
# attach them to the line as WordType elements.
for region in page.get_TextRegion():
    region_image, region_coords = self.workspace.image_from_segment(
        region, page_image, page_coords)
    for line in region.get_TextLine():
        # Words already present are either dropped or kept, depending on
        # overwrite_words; detection runs afterwards in both cases.
        existing_words = line.get_Word()
        if existing_words and overwrite_words:
            LOG.info('removing existing Words in line "%s"', line.id)
            line.set_Word([])
        elif existing_words:
            LOG.warning('keeping existing Words in line "%s"', line.id)

        LOG.debug("Detecting words in line '%s'", line.id)
        line_image, line_coords = self.workspace.image_from_segment(
            line, region_image, region_coords)
        tessapi.SetImage(line_image)

        components = tessapi.GetComponentImages(
            RIL.WORD, True, raw_image=True)
        for word_no, component in enumerate(components):
            word_id = '%s_word%04d' % (line.id, word_no)
            # component[1] is the item handed to polygon_from_xywh; the
            # resulting polygon is then converted using the line's
            # image and coordinates.
            outline = coordinates_for_segment(
                polygon_from_xywh(component[1]), line_image, line_coords)
            word_coords = CoordsType(points_from_polygon(outline))
            line.add_Word(WordType(id=word_id, Coords=word_coords))
# Use input_file's basename for the new file -
# this way the files retain the same basenames:
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID:
file_id = concat_padded(self.output_file_grp, n)
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,