Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def provide_oov(self, input_text, offset, has_other_words):
    """Provide a fallback out-of-vocabulary node starting at ``offset``.

    Args:
        input_text: Text object exposing ``get_code_points_offset_length``
            and ``get_substring``.
        offset: Start position passed to both ``input_text`` accessors.
        has_other_words: When truthy, no node is produced.

    Returns:
        An empty list if ``has_other_words`` is truthy; otherwise a
        one-element list holding a node whose word info uses the extracted
        substring as surface, normalized form and dictionary form.
    """
    if has_other_words:
        return []

    node = self.create_node()
    node.set_parameter(self.left_id, self.right_id, self.cost)

    # NOTE(review): presumably the length spanned by a single code point
    # beginning at `offset` -- confirm against the input text class.
    length = input_text.get_code_points_offset_length(offset, 1)
    surface = input_text.get_substring(offset, offset + length)

    info = wordinfo.WordInfo(
        surface=surface,
        head_word_length=length,
        pos_id=self.oov_pos_id,
        normalized_form=surface,
        dictionary_form_word_id=-1,  # -1: no dictionary entry is referenced
        dictionary_form=surface,
        reading_form="",
        a_unit_split=[],
        b_unit_split=[],
        word_structure=[],
    )
    node.set_word_info(info)
    return [node]
def provide_oov(self, input_text, offset, has_other_words):
    """Provide a fallback out-of-vocabulary node starting at ``offset``.

    The node's extent is whatever ``input_text.get_word_candidate_length``
    reports for ``offset``.

    Args:
        input_text: Text object exposing ``get_word_candidate_length`` and
            ``get_substring``.
        offset: Start position passed to both ``input_text`` accessors.
        has_other_words: When truthy, no node is produced.

    Returns:
        An empty list if ``has_other_words`` is truthy; otherwise a
        one-element list holding a node whose word info uses the extracted
        substring as surface, normalized form and dictionary form.
    """
    if has_other_words:
        return []

    node = self.create_node()
    node.set_parameter(self.left_id, self.right_id, self.cost)

    # NOTE(review): the exact meaning of "word candidate length" is defined
    # by the input text class, not visible here -- confirm.
    length = input_text.get_word_candidate_length(offset)
    surface = input_text.get_substring(offset, offset + length)

    info = wordinfo.WordInfo(
        surface=surface,
        head_word_length=length,
        pos_id=self.oov_pos_id,
        normalized_form=surface,
        dictionary_form_word_id=-1,  # -1: no dictionary entry is referenced
        dictionary_form=surface,
        reading_form="",
        a_unit_split=[],
        b_unit_split=[],
        word_structure=[],
    )
    node.set_word_info(info)
    return [node]
if not normalized_form:
normalized_form = surface
dictionary_form_word_id = int.from_bytes(self.bytes.read(4), 'little', signed=True)
reading_form = self.buffer_to_string()
a_unit_split = self.buffer_to_int_array()
b_unit_split = self.buffer_to_int_array()
word_structure = self.buffer_to_int_array()
dictionary_form = surface
if dictionary_form_word_id >= 0 and dictionary_form_word_id != word_id:
wi = self.get_word_info(dictionary_form_word_id)
dictionary_form = wi.surface
self.bytes.seek(orig_pos)
return wordinfo.WordInfo(surface, head_word_length, pos_id, normalized_form,
dictionary_form_word_id, dictionary_form, reading_form,
a_unit_split, b_unit_split, word_structure)
def get_oov_node(self, text, oov, length):
    """Create a node for the out-of-vocabulary string ``text``.

    Args:
        text: String used as surface, normalized form and dictionary form.
        oov: Object supplying ``left_id``, ``right_id``, ``cost`` and
            ``pos_id`` for the new node.
        length: Value stored as the word info's ``head_word_length``.

    Returns:
        The newly created node with its parameters and word info set.
    """
    node = self.create_node()
    node.set_parameter(oov.left_id, oov.right_id, oov.cost)

    info = wordinfo.WordInfo(
        surface=text,
        head_word_length=length,
        pos_id=oov.pos_id,
        normalized_form=text,
        dictionary_form_word_id=-1,  # -1: no dictionary entry is referenced
        dictionary_form=text,
        reading_form="",
        a_unit_split=[],
        b_unit_split=[],
        word_structure=[],
    )
    node.set_word_info(info)
    return node
raise ValueError('invalid part of speech')
entry.aunit_split_string = cols[15]
entry.bunit_split_string = cols[16]
entry.cunit_split_string = cols[17]
self.check_splitinfo_format(entry.aunit_split_string)
self.check_splitinfo_format(entry.bunit_split_string)
self.check_splitinfo_format(entry.cunit_split_string)
if cols[14] == 'A' and \
not (entry.aunit_split_string == '*' and entry.bunit_split_string == '*'):
raise ValueError('invalid splitting')
head_length = len(cols[0].encode('utf-8'))
dict_from_wordid = -1 if cols[13] == '*' else int(cols[13])
entry.wordinfo = WordInfo(
cols[4], head_length, pos_id, cols[12], dict_from_wordid, '', cols[11], None, None, None)
return entry
def concatenate_oov(self, path, begin, end, pos_id, lattice):
    """Merge ``path[begin:end]`` into one out-of-vocabulary node, in place.

    The merged node spans from the first node's begin to the last node's
    end. Its surface is the concatenation of the merged surfaces and its
    head word length is the sum of their head word lengths.

    Args:
        path: Mutable sequence of nodes; the slice ``begin:end`` is replaced
            by the single merged node.
        begin: Index of the first node to merge (inclusive).
        end: Index one past the last node to merge (exclusive).
        pos_id: Part-of-speech id assigned to the merged word info.
        lattice: Object whose ``create_node()`` supplies the new node.

    Returns:
        The newly created merged node.

    Raises:
        IndexError: If ``begin >= end``.
    """
    if begin >= end:
        raise IndexError("begin >= end")

    range_begin = path[begin].get_begin()
    range_end = path[end - 1].get_end()

    # Gather the word infos once, then build the surface with a single join
    # instead of repeated string concatenation.
    infos = [path[i].get_word_info() for i in range(begin, end)]
    surface = "".join(info.surface for info in infos)
    length = sum(info.head_word_length for info in infos)

    wi = WordInfo(
        surface=surface,
        head_word_length=length,
        pos_id=pos_id,
        normalized_form=surface,
        dictionary_form=surface,
        dictionary_form_word_id=-1,  # -1: no dictionary entry is referenced
        reading_form="",
        a_unit_split=[],
        b_unit_split=[],
        word_structure=[],
    )

    node = lattice.create_node()
    node.set_range(range_begin, range_end)
    node.set_word_info(wi)

    # Replace the merged nodes in the caller's list with the single new node.
    path[begin:end] = [node]
    return node