Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def split(self, mode, index, wi):
    """Split the morpheme at ``index`` into the finer units of ``mode``.

    Args:
        mode: Split mode. ``SplitMode.A`` selects ``wi.a_unit_split`` and
            ``SplitMode.B`` selects ``wi.b_unit_split``.
        index: Position of the morpheme, used both to index this object
            and to index ``self.path``.
        wi: Word info object carrying ``a_unit_split`` / ``b_unit_split``.

    Returns:
        A one-element list holding the morpheme at ``index`` when ``mode``
        is neither A nor B, or when the selected split has fewer than two
        word ids. Otherwise a ``MorphemeList`` built from one freshly
        created lattice node per split word id.
    """
    split_modes = tokenizer.Tokenizer.SplitMode
    if mode is split_modes.A:
        unit_ids = wi.a_unit_split
    elif mode is split_modes.B:
        unit_ids = wi.b_unit_split
    else:
        return [self[index]]

    # Nothing to split when there are zero or one unit ids.
    if len(unit_ids) < 2:
        return [self[index]]

    # Lay the new nodes out back to back, starting where the original
    # node begins and advancing by each unit's head word length.
    cursor = self.path[index].get_begin()
    pieces = []
    for unit_id in unit_ids:
        piece = latticenode.LatticeNode(self.lexicon, 0, 0, 0, unit_id)
        piece.set_begin(cursor)
        cursor += piece.get_word_info().head_word_length
        piece.set_end(cursor)
        pieces.append(piece)
    return MorphemeList(self.input_text, self.grammar, self.lexicon, pieces)
# NOTE(review): fragment -- the enclosing `def` line (and whatever guard
# precedes this `return []`) is not visible in this excerpt, and the original
# indentation has been lost. Comments below describe only the visible calls.
return []
# Build the input text: every input-text plugin gets to rewrite the builder
# before build() is called.
builder = utf8inputtextbuilder.UTF8InputTextBuilder(text, self.grammar)
for plugin in self.input_text_plugins:
plugin.rewrite(builder)
input_ = builder.build()
bytes_ = input_.get_byte_text()
# Size the lattice to the length of the byte text, then visit every offset.
self.lattice.resize(len(bytes_))
for i in range(len(bytes_)):
# Skip offsets for which is_char_alignment(i) is false or for which the
# lattice reports no previous node.
if not input_.is_char_alignment(i) or not self.lattice.has_previous_node(i):
continue
iterator = self.lexicon.lookup(bytes_, i)
# NOTE(review): if lookup() returns a generator/iterator object, this
# truthiness test is always True even when it yields nothing, which would
# disable the `not has_words` fallback below -- confirm against the lexicon.
has_words = True if iterator else False
# Insert one lattice node per (word_id, end) pair produced by the lookup,
# using the lexicon's left id, right id and cost for that word id.
for word_id, end in iterator:
n = latticenode.LatticeNode(self.lexicon,
self.lexicon.get_left_id(word_id),
self.lexicon.get_right_id(word_id),
self.lexicon.get_cost(word_id),
word_id)
self.lattice.insert(i, end, n)
# OOV
# OOV plugins are consulted only when NOOOVBOW is not among the character
# category types at offset i; every node they return is inserted.
if categorytype.CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
for oov_plugin in self.oov_provider_plugins:
for node in oov_plugin.get_oov(input_, i, has_words):
has_words = True
self.lattice.insert(node.get_begin(), node.get_end(), node)
# Fall back to the default OOV provider only when nothing has set has_words
# so far and a default provider is configured.
if not has_words and self.default_oov_provider:
for node in self.default_oov_provider.get_oov(input_, i, has_words):
has_words = True
self.lattice.insert(node.get_begin(), node.get_end(), node)
# NOTE(review): fragment -- `path`, `mode` and `input_` are defined outside
# this excerpt, and the original indentation has been lost.
# Hand the path to every path-rewrite plugin (the return value is ignored).
for plugin in self.path_rewrite_plugins:
plugin.rewrite(input_, path, self.lattice)
# For any mode other than C, rebuild the path from each node's A-unit or
# B-unit split word ids.
if mode is not self.SplitMode.C:
new_path = []
for node in path:
if mode is self.SplitMode.A:
wids = node.get_word_info().a_unit_split
else: # self.SplitMode.B
wids = node.get_word_info().b_unit_split
# Nodes with zero or one split ids are kept unchanged.
if len(wids) == 0 or len(wids) == 1:
new_path.append(node)
else:
# Otherwise create one node per split word id, placed back to back starting
# at the original node's begin and advancing by each head_word_length.
offset = node.get_begin()
for wid in wids:
n = latticenode.LatticeNode(self.lexicon, 0, 0, 0, wid)
n.begin = offset
offset += n.get_word_info().head_word_length
n.end = offset
new_path.append(n)
path = new_path
# dump_output
# Wrap the resulting path in a MorphemeList and return it.
ml = morphemelist.MorphemeList(input_, self.grammar, self.lexicon, path)
return ml
def create_node(self):
    """Return a new ``LatticeNode`` on which ``set_oov()`` has been called."""
    oov_node = latticenode.LatticeNode()
    oov_node.set_oov()
    return oov_node
def create_node() -> LatticeNode:
    """Return a new ``LatticeNode`` constructed with no arguments."""
    fresh_node = LatticeNode()
    return fresh_node
def create_node(self):
    """Create a ``LatticeNode``, call ``set_oov()`` on it, and return it."""
    marked_node = latticenode.LatticeNode()
    marked_node.set_oov()
    return marked_node
def resize(self, size: int) -> None:
    """Set the lattice size to ``size`` and install a fresh EOS node.

    ``self.expand(size)`` is called first when ``size`` exceeds
    ``self.capacity``. The new EOS node receives the first three entries of
    ``self.eos_params`` and has both ``begin`` and ``end`` set to ``size``.

    Args:
        size: New size of the lattice.
    """
    needs_more_room = size > self.capacity
    if needs_more_room:
        self.expand(size)
    self.size = size

    # Publish the node on the instance first, then configure it.
    eos = LatticeNode()
    self.eos_node = eos
    params = self.eos_params
    eos.set_parameter(params[0], params[1], params[2])
    eos.begin = eos.end = size
def resize(self, size):
    """Resize the lattice to ``size`` and replace its EOS node.

    Grows the lattice through ``self.expand(size)`` when ``size`` is larger
    than ``self.capacity``, records ``size`` on the instance, then creates a
    new EOS node configured from the first three entries of
    ``self.eos_params`` whose ``begin`` and ``end`` both equal ``size``.
    """
    must_grow = size > self.capacity
    if must_grow:
        self.expand(size)
    self.size = size

    # Attach the new node to the instance before configuring it.
    end_node = latticenode.LatticeNode()
    self.eos_node = end_node
    eos_params = self.eos_params
    end_node.set_parameter(eos_params[0], eos_params[1], eos_params[2])
    end_node.begin = end_node.end = size
def _split_path(self, path: List[LatticeNode], mode: SplitMode) -> List[LatticeNode]:
    """Return ``path`` with its nodes split into the units of ``mode``.

    Args:
        path: Lattice nodes to process.
        mode: Split mode. For mode C the input list itself is returned;
            mode A uses each node's ``a_unit_split``; any other mode uses
            ``b_unit_split``.

    Returns:
        ``path`` unchanged for mode C. Otherwise a new list in which each
        node with two or more split word ids is replaced by one new node
        per id, and every other node is carried over as is.
    """
    if mode == self.SplitMode.C:
        return path

    result = []
    for original in path:
        split_attr = "a_unit_split" if mode is self.SplitMode.A else "b_unit_split"
        unit_ids = getattr(original.get_word_info(), split_attr)
        if len(unit_ids) > 1:
            # Place the pieces back to back from the original node's begin,
            # advancing by each piece's head word length.
            cursor = original.get_begin()
            for unit_id in unit_ids:
                piece = LatticeNode(self._lexicon, 0, 0, 0, unit_id)
                piece.begin = cursor
                cursor += piece.get_word_info().head_word_length
                piece.end = cursor
                result.append(piece)
        else:
            result.append(original)
    return result