Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
n = len(self.range_list)
end = n
pivot = (begin + end) // 2
while 0 <= pivot < n:
range_ = self.range_list[pivot]
if range_.contains(code_point):
return range_.categories
if range_.lower(code_point):
begin = pivot
else: # range_.higher(code_point)
end = pivot
new_pivot = (begin + end) // 2
if new_pivot == pivot:
break
pivot = new_pivot
return {categorytype.CategoryType.DEFAULT}
def test_get_char_category_types(self):
    """Check which category is reported at a series of sampled indices.

    Each pair below is an index passed to ``get_char_category_types`` and
    a category that must be contained in the returned collection.
    """
    input_ = self.builder.build()
    category = dictionarylib.categorytype.CategoryType
    expectations = (
        (0, category.ALPHA),
        (2, category.ALPHA),
        (5, category.ALPHA),
        (6, category.NUMERIC),
        (7, category.HIRAGANA),
        (9, category.HIRAGANA),
        (10, category.NUMERIC),
        (13, category.KANJI),
        (18, category.KANJI),
        (19, category.DEFAULT),
        (22, category.DEFAULT),
        (23, category.KATAKANA),
        (26, category.KATAKANA),
        (31, category.KATAKANA),
    )
    for index, expected in expectations:
        self.assertTrue(expected in input_.get_char_category_types(index))
def test_read_oov(self):
    """Write a two-line OOV file, load it, and inspect the first entry.

    Both lines are keyed by DEFAULT, so the plugin is expected to hold a
    single category with two entries after ``read_oov``.
    """
    oov = os.path.join(self.test_dir, 'test.txt')
    oov_lines = (
        "DEFAULT,1,2,3,補助記号,一般,*,*,*,*\n",
        "DEFAULT,3,4,5,補助記号,一般,*,*,*,*\n",
    )
    with open(oov, 'w') as wf:
        for oov_line in oov_lines:
            wf.write(oov_line)

    plugin = MeCabOovPlugin()
    plugin.categories[CategoryType.DEFAULT] = MeCabOovPlugin.CategoryInfo()
    plugin.read_oov(oov, mock_grammar.mocked_grammar)

    self.assertEqual(1, len(plugin.oov_list))
    default_entries = plugin.oov_list[CategoryType.DEFAULT]
    self.assertEqual(2, len(default_entries))

    # The first entry carries the numeric columns of the first line.
    first_entry = default_entries[0]
    self.assertEqual(1, first_entry.left_id)
    self.assertEqual(2, first_entry.right_id)
    self.assertEqual(3, first_entry.cost)
    self.assertEqual(0, first_entry.pos_id)
raise AttributeError("invalid format at line {}".format(i))
if not re.match("0x", cols[0]):
continue
range_ = self.Range()
r = re.split("\\.\\.", cols[0])
range_.low = int(r[0], 16)
range_.high = range_.low + 1
if len(r) > 1:
range_.high = int(r[1], 16) + 1
if range_.low >= range_.high:
f.close()
raise AttributeError("invalid range at line {}".format(i))
for j in range(1, len(cols)):
if re.match("#", cols[j]) or cols[j] == '':
break
type_ = categorytype.CategoryType.get(cols[j])
if type_ is None:
f.close()
raise AttributeError("{} is invalid type at line {}".format(cols[j], i))
range_.categories.append(type_)
self.range_list.append(range_)
f.close()
self._compile()
def get_type(self, id_):
    """Return the category type whose ``get_id()`` equals ``id_``.

    Args:
        id_: identifier to look up among ``CategoryType.values()``.

    Returns:
        The first matching category type, or ``None`` if none matches.
    """
    for type_ in CategoryType.values():
        # Compare by value: ``is`` tests object identity, which only
        # happens to hold for small cached integers and silently fails
        # for equal ids that are distinct objects.
        if type_.get_id() == id_:
            return type_
    return None
def can_oov_bow_node(self, text, node):
    """Return True unless NOOOVBOW is among the types at the node's begin.

    The category types are looked up on ``text`` at ``node.get_begin()``.
    """
    begin = node.get_begin()
    types_at_begin = text.get_char_category_types(begin)
    return CategoryType.NOOOVBOW not in types_at_begin
def rewrite(self, text, path, lattice):
begin_index = -1
comma_as_digit = True
period_as_digit = True
parser = NumericParser()
i = -1
while i < len(path) - 1:
i += 1
node = path[i]
types = self.get_char_category_types(text, node)
s = node.get_word_info().normalized_form
if CategoryType.NUMERIC in types or CategoryType.KANJINUMERIC in types or \
(period_as_digit and s == '.') or (comma_as_digit and s == ','):
if begin_index < 0:
parser.clear()
begin_index = i
for c in s:
if not parser.append(c):
if begin_index >= 0:
if parser.error_state == NumericParser.Error.COMMA:
comma_as_digit = False
i = begin_index - 1
elif parser.error_state == NumericParser.Error.POINT:
period_as_digit = False
i = begin_index - 1
begin_index = -1
def get_category_types(self, code_point):
    """Return the categories of the first range containing ``code_point``.

    Args:
        code_point: value tested against each entry of ``self.range_list``
            via its ``contains`` method.

    Returns:
        The ``categories`` of the first matching range, or a one-element
        set holding ``CategoryType.DEFAULT`` when no range matches.
    """
    for range_ in self.range_list:
        if range_.contains(code_point):
            return range_.categories
    # Use a set literal: ``set(x)`` would try to iterate over the single
    # category member instead of wrapping it in a set.
    return {categorytype.CategoryType.DEFAULT}
continue
iterator = self._lexicon.lookup(bytes_, i)
has_words = False
for word_id, end in iterator:
if (end < len(bytes_)) and (not input_.can_bow(end)):
continue
has_words = True
n = LatticeNode(self._lexicon,
self._lexicon.get_left_id(word_id),
self._lexicon.get_right_id(word_id),
self._lexicon.get_cost(word_id),
word_id)
self._lattice.insert(i, end, n)
# OOV
if CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
for oov_plugin in self._oov_provider_plugins:
for node in oov_plugin.get_oov(input_, i, has_words):
has_words = True
self._lattice.insert(node.get_begin(), node.get_end(), node)
if not has_words and self.default_oov_provider:
for node in self.default_oov_provider.get_oov(input_, i, has_words):
has_words = True
self._lattice.insert(node.get_begin(), node.get_end(), node)
if not has_words:
raise RuntimeError("there is no morpheme at " + str(i))
self._lattice.connect_eos_node()