Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
raise ValueError("num_part must be at least 1")
if m < 2 or m > num_perm:
raise ValueError("m must be in the range of [2, num_perm]")
if any(w < 0.0 or w > 1.0 for w in weights):
raise ValueError("Weight must be in [0.0, 1.0]")
if sum(weights) != 1.0:
raise ValueError("Weights must sum to 1.0")
self.threshold = threshold
self.h = num_perm
self.m = m
rs = self._init_optimal_params(weights)
# Initialize multiple LSH indexes for each partition
storage_config = {'type': 'dict'} if not storage_config else storage_config
basename = storage_config.get('basename', _random_name(11))
self.indexes = [
dict((r, MinHashLSH(
num_perm=self.h,
params=(int(self.h/r), r),
storage_config=self._get_storage_config(
basename, storage_config, partition, r),
prepickle=prepickle)) for r in rs)
for partition in range(0, num_part)]
self.lowers = [None for _ in self.indexes]
self.uppers = [None for _ in self.indexes]
def eg1():
    """Demonstrate a MinHash LSH lookup over three sets.

    Builds one 128-permutation MinHash per input set (``set1``, ``set2`` and
    ``set3``, which are defined outside this function), indexes the sketches
    of ``set2`` and ``set3`` under the keys "m2" and "m3", then queries the
    index with the sketch of ``set1`` and prints whatever keys come back.
    """
    sketches = [MinHash(num_perm=128) for _ in range(3)]
    for sketch, items in zip(sketches, (set1, set2, set3)):
        for item in items:
            # Each element is UTF-8 encoded before being hashed
            # (presumably the sets hold str values -- confirm at the caller).
            sketch.update(item.encode('utf8'))
    query_sketch, sketch2, sketch3 = sketches

    # Create LSH index; only the second and third sketches are stored,
    # the first one serves as the query.
    index = MinHashLSH(threshold=0.5, num_perm=128)
    for key, sketch in (("m2", sketch2), ("m3", sketch3)):
        index.insert(key, sketch)

    neighbours = index.query(query_sketch)
    print("Approximate neighbours with Jaccard similarity > 0.5", neighbours)
1.0. The initialized MinHash LSH will be optimized for the
threshold by minimizing the false positives and false negatives.
Returns:
A tuple with an LSH (instance of datasketch.lsh.LSH) and a
dictionary with event ID as key and minhash as value.
"""
if delimiters is None:
delimiters = DEFAULT_DELIMITERS
if num_perm is None:
num_perm = DEFAULT_PERMUTATIONS
if threshold is None:
threshold = DEFAULT_THRESHOLD
minhashes = {}
lsh = MinHashLSH(threshold, num_perm)
with lsh.insertion_session() as lsh_session:
for event in events:
# Insert minhash in LSH index.
key = (event.event_id, event.event_type, event.index_name)
minhash = minhash_from_text(
event.source[field], num_perm, delimiters)
minhashes[key] = minhash
lsh_session.insert(key, minhash)
return lsh, minhashes
def eg2():
    """Demonstrate an LSH lookup over weighted MinHash sketches.

    Creates a ``WeightedMinHashGenerator(10, 5)``, sketches the vectors
    ``v1``, ``v2`` and ``v3`` (defined outside this function), prints the
    estimated Jaccard similarity of the first sketch against the other two,
    then indexes the second and third sketches under "m2" and "m3" and
    prints the result of querying the index with the first sketch.
    """
    # The same value (5) is handed to both the generator and the LSH index.
    # NOTE(review): presumably the generator's second argument is its sample
    # size and has to match the index's num_perm -- confirm against datasketch.
    num_perm = 5
    generator = WeightedMinHashGenerator(10, num_perm)
    query_sketch, sketch2, sketch3 = [
        generator.minhash(vector) for vector in (v1, v2, v3)
    ]

    for label, other in (("m1, m2", sketch2), ("m1, m3", sketch3)):
        print("Estimated Jaccard " + label, query_sketch.jaccard(other))

    # Create LSH index
    index = MinHashLSH(threshold=0.1, num_perm=num_perm)
    index.insert("m2", sketch2)
    index.insert("m3", sketch3)

    neighbours = index.query(query_sketch)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", neighbours)