def test_set_operations(self):
    """Test advanced set operations"""
    items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
    items2 = set(["cdefg", "lmnop"])
    idx1 = NGram(items1)
    idx2 = NGram(items2)
    results = lambda L: sorted(x[0] for x in L)
    # Item removal
    self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
    idx1.remove('abcde')
    self.assertEqual(results(idx1.search('cde')), ["cdefg"])
    # Set intersection operation
    items1.remove('abcde')
    idx1.intersection_update(idx2)
    self.assertEqual(idx1, items1.intersection(items2))
    self.assertEqual(results(idx1.search('lmn')), [])
    self.assertEqual(results(idx1.search('ijk')), [])
    self.assertEqual(results(idx1.search('def')), ['cdefg'])
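# A minimal standalone sketch of the same set semantics outside the test
# class, assuming only the public ngram.NGram API exercised above; the
# items here are illustrative, not from the original suite.
from ngram import NGram

index = NGram(["abcde", "cdefg", "lmnop"])
index.remove("abcde")                        # NGram behaves like a set of items
index.intersection_update(NGram(["cdefg"]))  # in-place set operation keeps the index consistent
print(sorted(index))                         # ['cdefg']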
def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the
    rewrite still uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'),
                     [('askfjwehiuasdfji', 1.0),
                      ('asdfawe', 0.17391304347826086),
                      ('asfwef', 0.083333333333333329),
                      ('adfwe', 0.041666666666666664),
                      ])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
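# A short usage sketch of the two calls exercised above, assuming the
# ngram package is installed; the sample strings are invented.
from ngram import NGram

idx = NGram(["adfwe", "asdfawe", "asfwef"])
print(idx.search("afadfwe")[:1])          # best match with its similarity score
print(NGram.compare("sdfeff", "sdfefe"))  # pairwise similarity in [0.0, 1.0]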
import ngram  # the character branch relies on the ngram package


def getNGrams(text, type, n):
    # tokenize_input (defined elsewhere) also removes the stop words
    tokens = list(tokenize_input(text))
    if type == "word":
        if len(tokens) >= n:  # need at least n tokens to form one n-gram
            # join each window of n consecutive tokens (no separator)
            return ["".join(j) for j in zip(*[tokens[i:] for i in range(n)])]
        else:
            # return the text directly if n exceeds the number of words
            return [text]
    if type == "character":
        ngramObject = ngram.NGram(N=n)
        ngram_char_tokens = list(ngramObject.split(text))
        # drop the first and last n-1 tokens: they are incomplete grams
        # padded with '$' signs
        if len(text) > n:
            return ngram_char_tokens[n - 1:len(ngram_char_tokens) - (n - 1)]
        else:
            return [text]
    else:
        return list(tokenize_input(text))
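# Illustrative call against the function above. tokenize_input is not
# shown, so the word branch depends on its behavior; the character branch
# is deterministic: ngram.NGram(N=3).split('hello') yields padded grams
# such as '$$h' and 'o$$', and trimming n-1 tokens from each end keeps
# only the interior trigrams ['hel', 'ell', 'llo'].
print(getNGrams("hello", "character", 3))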
def getNGrams(text, type, n):
    # variant of the function above that keeps the padded character
    # n-grams instead of trimming the incomplete ones
    tokens = list(tokenize_input(text))  # tokenizer also removes stop words
    if type == "word":
        if len(tokens) >= n:  # need at least n tokens to form one n-gram
            return ["".join(j) for j in zip(*[tokens[i:] for i in range(n)])]
        else:
            # return the text directly if n exceeds the number of words
            return [text]
    if type == "character":
        ngramObject = ngram.NGram(N=n)
        ngram_char_tokens = list(ngramObject.split(text))
        if len(text) > n:
            return ngram_char_tokens
        else:
            return [text]
    else:
        return list(tokenize_input(text))
def check_all_dict(self, ner_conv, cb_data, cb_data_order, cb_data_th):
    """
    Check the remaining dictionaries when no matching value was found.
    :param ner_conv: string to look up
    :param cb_data: mapping from dictionary key to its candidate values
    :param cb_data_order: keys of cb_data in the order they should be tried
    :param cb_data_th: per-key similarity threshold for NGram.search
    :return: up to four matching values and the key they came from,
             or ([], None) when nothing matches
    """
    result = []
    for key in cb_data_order:
        model = ngram.NGram(key=self.lower)
        model.update(cb_data.get(key))
        result = [x[0] for x in model.search(ner_conv, threshold=cb_data_th[key])][:4]
        if len(result) > 0:
            return result, key
    return result, None
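# A self-contained sketch of the lookup pattern above, outside the class,
# with invented dictionaries and thresholds; self.lower is assumed to be
# a str.lower-style key function, and lookup_in_dicts is a hypothetical
# stand-in name.
import ngram

cb_data = {"city": ["seoul", "busan"], "food": ["bibimbap", "bulgogi"]}
cb_data_order = ["city", "food"]
cb_data_th = {"city": 0.3, "food": 0.3}

def lookup_in_dicts(query):
    for key in cb_data_order:
        model = ngram.NGram(cb_data[key], key=str.lower)
        hits = [item for item, score in model.search(query, threshold=cb_data_th[key])][:4]
        if hits:
            return hits, key
    return [], None

print(lookup_in_dicts("bulgoki"))  # expected: (['bulgogi'], 'food')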
def __reduce__(self):
    """Return state information for pickling, no references to this
    instance.  The key function must be None, a builtin function, or
    a named module-level function.

    >>> from ngram import NGram
    >>> n = NGram([0xDEAD, 0xBEEF], key=hex)
    >>> import pickle
    >>> p = pickle.dumps(n)
    >>> m = pickle.loads(p)
    >>> sorted(list(m))
    [48879, 57005]
    """
    return NGram, (list(self), self.threshold, self.warp, self._key,
                   self.N, self._pad_len, self._pad_char)
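# Why the docstring restricts the key function: pickle stores functions
# by reference, so a lambda key cannot be serialized. A small hedged
# demonstration (not part of the library's own test suite):
import pickle
from ngram import NGram

ok = NGram([0xDEAD], key=hex)        # builtin function: picklable by name
pickle.loads(pickle.dumps(ok))       # round-trips fine
bad = NGram([0xDEAD], key=lambda x: hex(x))
try:
    pickle.dumps(bad)                # lambdas cannot be pickled
except Exception as exc:
    print(type(exc).__name__)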
self.param['conninfo'] = netconf['conninfo']
if self.param['datatype'] == 'file':
    self.get_file_data(data_node)
elif self.param['datatype'] == 'db':
    self.get_db_data()
item = []
for val in self.param['list']:
    try:
        item_tuple = (val['item_code'].strip(), val['item_leaf'].strip(), val['item_desc'].strip())
        item.append(item_tuple)
    except (KeyError, AttributeError):
        # catch missing keys or non-string values instead of a bare except
        logging.info('Error Data: %s', val.get('item_code'))
# index the items by their description (third tuple element)
dataset = ngram.NGram(item, key=lambda x: x[2])
dataset = sorted(dataset, key=lambda x: x[0])
findset = ngram.NGram(item, key=lambda x: x[2])
logging.info('================================================================================================')
return_data = {}
for data in dataset:
    # drop the current item so it cannot match itself; earlier items stay
    # removed, so each similar pair is reported only once
    findset.remove(data)
    result = findset.search(data[2], self.param['standard'])
    for r in range(len(result)):
        if return_data.get(data[0]) is None:
            return_data[data[0]] = {}
            return_data[data[0]]['desc'] = data[2]
        return_data[data[0]][result[r][0][0]] = {'item_desc': result[r][0][2],
                                                 'item_perc': result[r][1]}
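# A condensed, self-contained sketch of the grouping pattern above, with
# invented item tuples and a 0.5 threshold standing in for
# self.param['standard'].
import ngram

items = [('A1', 'Y', 'stainless steel bolt m6'),
         ('A2', 'Y', 'stainless steel bolt m8'),
         ('B1', 'N', 'rubber gasket 40mm')]
findset = ngram.NGram(items, key=lambda x: x[2])  # index on description
groups = {}
for code, leaf, desc in sorted(items):
    findset.remove((code, leaf, desc))  # avoid self-matches and duplicate pairs
    for match, score in findset.search(desc, 0.5):
        groups.setdefault(code, {'desc': desc})[match[0]] = {
            'item_desc': match[2], 'item_perc': score}
print(groups)  # expected: A1 grouped with A2; B1 has no similar item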