Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def setUp(self):
    """Create a fresh small_world_rand index over the Levenshtein space.

    Deletes any stale on-disk index file left by a previous run so every
    test starts from a clean state, then initializes an nmslib index for
    string objects with integer (edit) distance and stores it on self.
    """
    space = 'leven'
    space_params = []
    method = 'small_world_rand'
    stale_index_file = method + '.index'
    if os.path.isfile(stale_index_file):
        os.remove(stale_index_file)
    self.index = nmslib.init(space,
                             space_params,
                             method,
                             nmslib.DataType.OBJECT_AS_STRING,
                             nmslib.DistType.INT)
# NOTE(review): indentation was stripped from this chunk, and from the second
# nmslib.init() call onward the statements refer to names (f, seq, TimeIt,
# read_data_fast_batch, read_data, np, MAX_PRINT_QTY) that are never defined
# in this view -- this looks like two different sample snippets concatenated.
# Code kept byte-identical; only comments added. Verify block structure
# against the original file before relying on it.
def test_string_loaded():
# Small corpus of strings to index under the Levenshtein distance.
DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
"d", "c", "bdaf", "ddcd",
"egbfa", "a", "fba", "bcccfe",
"ab", "bfgbfdc", "bcbbgf", "bfbb"
]
QUERY_STRS = ["abc", "def", "ghik"]
# Index configuration: string objects, integer (edit) distance.
space_type = 'leven'
space_param = []
method_name = 'small_world_rand'
index_name = method_name + '.index'
index = nmslib.init(
space_type,
space_param,
method_name,
nmslib.DataType.OBJECT_AS_STRING,
nmslib.DistType.INT)
# Insert each string one at a time; note `id` shadows the builtin.
for id, data in enumerate(DATA_STRS):
nmslib.addDataPoint(index, id, data)
print('Let\'s print a few data entries')
print('We have added %d data points' % nmslib.getDataPointQty(index))
# MAX_PRINT_QTY is not defined in this view -- presumably a module-level
# constant limiting how many entries are echoed; confirm upstream.
for i in range(0,min(MAX_PRINT_QTY,nmslib.getDataPointQty(index))):
print(nmslib.getDataPoint(index,i))
print('Let\'s invoke the index-build process')
# From here on the snippet switches to a dense-vector / FLOAT index and
# reads from an open file handle `f` -- unrelated to the string test above.
index = nmslib.init(
space_type,
space_param,
method_name,
nmslib.DataType.DENSE_VECTOR,
nmslib.DistType.FLOAT)
with TimeIt('fast_batch add data point'):
offset = 0
# Batched insertion: ids are a contiguous int32 range offset by how many
# rows have been added so far.
for data in read_data_fast_batch(f, 10000):
nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32) + offset, data)
offset += data.shape[0]
print('offset', offset)
nmslib.freeIndex(index)
# `seq` is undefined here; presumably a flag selecting the slower
# one-point-at-a-time insertion path for timing comparison -- verify.
if seq:
index = nmslib.init(
space_type,
space_param,
method_name,
nmslib.DataType.DENSE_VECTOR,
nmslib.DistType.FLOAT)
with TimeIt('seq add data point'):
for id, data in enumerate(read_data(f)):
nmslib.addDataPoint(index, id, data)
nmslib.freeIndex(index)
def test_vector_fresh(fast=True):
    """Build a fresh small_world_rand index over cosine similarity.

    Removes any stale on-disk index file, creates a dense-vector FLOAT
    index, and populates it from 'sample_dataset.txt' either with one
    batch call (fast=True) or one point at a time (fast=False).
    """
    space = 'cosinesimil'
    space_params = []
    method = 'small_world_rand'
    stale_index_file = method + '.index'
    if os.path.isfile(stale_index_file):
        os.remove(stale_index_file)
    index = nmslib.init(space,
                        space_params,
                        method,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)
    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        print('data.shape', data.shape)
        # Batch insert with a contiguous int32 id range.
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        # Sequential insert; warn whenever the assigned position diverges
        # from the requested id.
        for row_id, row in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, row_id, row)
            if row_id != pos:
                print('id %s != pos %s' % (row_id, pos))
# NOTE(review): interior of a method whose `def` line is outside this chunk
# (it reads self._hnsw_index_lut and self._nprocs); indentation was stripped.
# Code kept byte-identical; only comments added.
if query_params is None:
query_params = {
"efSearch": 100
}
# create index
# Build a cache key from the index-affecting HNSW parameters plus the metric
# and PCA flag, so identical configurations can reuse a built index.
ind_pm_key = sorted([(k, v) for k, v in index_params.items()
if k in ["efConstruction", "M",
"delaunay_type", "post"]])
ind_pm_key.append(("metric", metric))
ind_pm_key.append(("use_pca", use_pca))
str_ind_pm_key = str(ind_pm_key)
if str_ind_pm_key in self._hnsw_index_lut:
# Cache hit: reuse the previously built index.
hnsw = self._hnsw_index_lut[str_ind_pm_key]
else:
# Cache miss: build a new HNSW index over data_x and memoize it.
hnsw = nmslib.init(method="hnsw", space=metric,
data_type=data_type)
hnsw.addDataPointBatch(data_x)
hnsw.createIndex(index_params, print_progress=verbose)
self._hnsw_index_lut[str_ind_pm_key] = hnsw
# query KNN
hnsw.setQueryTimeParams(query_params)
# k nearest neighbors
# hnsw query may include self.
# Ask for k+1 neighbors because each query point's own entry may appear
# in its result list.
compute_k = k + 1
knns = hnsw.knnQueryBatch(
data_x, k=compute_k, num_threads=self._nprocs)
# print(knns)
# construct knn conn mat.
knn_targets_sep_l = []
knn_weights_sep_l = []
# need benchmark
# NOTE(review): fragment without its enclosing `def`; `desc`, `desc_list` and
# `image_index` come from outside this view. The sequence below (L1-normalize,
# element-wise sqrt, then L2-normalize) resembles RootSIFT-style descriptor
# processing -- confirm against the original file.
desc = desc.astype(np.float32)
desc /= (desc.sum(axis=1, keepdims=True) + 1e-7)
desc = np.sqrt(desc)
desc_p = pow(desc, 2)
desc /= np.sqrt(desc_p.sum(axis=1,
keepdims=True) + 1e-7)
# Collect each row descriptor; desc_list is converted to one ndarray below.
for i in range(desc.shape[0]):
desc_list.append(desc[i, :])
image_index += 1
desc_list = np.array(desc_list)
# Retrieve nn * expand candidates at query time (via ef) so downstream
# filtering can still leave roughly nn usable results.
expand = 3
nn = 30
index = nmslib.init(space='l2', method='hnsw')
index.addDataPointBatch(data=desc_list)
index.createIndex(
print_progress=True,
index_params={
"maxM": 32,
"maxM0": 64,
"indexThreadQty": 24})
index.setQueryTimeParams(params={"ef": nn * expand})
query_result = []
pbar = tqdm(dataset.query_list)
filter_flag = False
# NOTE(review): the loop body is truncated in this chunk. A 5-element entry
# is unpacked into a path plus bounding-box coordinates; the ordering
# (left, top, bottom, right) is unusual -- verify against the producer.
for query_idx, image_filepath in enumerate(pbar):
if len(image_filepath) == 5:
image_filepath, left, top, bottom, right = image_filepath
# NOTE(review): indentation was stripped from this method and its body is
# truncated at the final `else:` in this chunk. Code kept byte-identical;
# only comments added. Builds (or loads) an HNSW cosine-similarity index
# over `feats`, then queries the top-k neighbors.
def __init__(self, feats, k, index_path='', verbose=True):
import nmslib
self.verbose = verbose
with Timer('[hnsw] build index', verbose):
""" higher ef leads to better accuracy, but slower search
higher M leads to higher accuracy/run_time at fixed ef, but consumes more memory
"""
# space_params = {
# 'ef': 100,
# 'M': 16,
# }
# index = nmslib.init(method='hnsw', space='cosinesimil', space_params=space_params)
index = nmslib.init(method='hnsw', space='cosinesimil')
# Reuse a previously saved index when one exists at index_path;
# otherwise build from feats and persist it if a path was given.
if index_path != '' and os.path.isfile(index_path):
index.loadIndex(index_path)
else:
index.addDataPointBatch(feats)
index.createIndex({'post': 2, 'indexThreadQty': 1}, print_progress=verbose)
if index_path:
print('[hnsw] save index to {}'.format(index_path))
mkdir_if_no_exists(index_path)
index.saveIndex(index_path)
with Timer('[hnsw] query topk {}'.format(k), verbose):
# If precomputed knns were saved next to the index (.npz), load them
# instead of querying; each entry becomes (int32 ids, float32 dists).
knn_ofn = index_path + '.npz'
if os.path.exists(knn_ofn):
print('[hnsw] read knns from {}'.format(knn_ofn))
self.knns = [(knn[0, :].astype(np.int32), knn[1, :].astype(np.float32)) \
for knn in np.load(knn_ofn)['data']]
else: