Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
from it and then creates a MinHash object from every
remaining character in the domain.
If a domain starts with www., it will be stripped of the
domain before the Minhash is calculated.
Args:
domain: string with a full domain, eg. www.google.com
Returns:
A minhash (instance of datasketch.minhash.MinHash)
"""
domain_items = domain.split('.')
domain_part = '.'.join(domain_items[:-1])
minhash = MinHash(similarity.DEFAULT_PERMUTATIONS)
for char in domain_part:
minhash.update(char.encode('utf8'))
return minhash
def eg1():
    """Demonstrate an approximate-neighbour lookup with MinHashLSH.

    Three 128-permutation MinHash sketches are filled from the
    collections ``set1``, ``set2`` and ``set3`` (names defined outside
    this function). The sketches for ``set2`` and ``set3`` are indexed
    under the keys "m2" and "m3"; the index is then queried with the
    sketch of ``set1`` and the query result is printed.
    """
    query_sketch = MinHash(num_perm=128)
    sketch_two = MinHash(num_perm=128)
    sketch_three = MinHash(num_perm=128)

    # Feed every member of each collection into its own sketch;
    # members are encoded to UTF-8 bytes before hashing.
    pairs = (
        (query_sketch, set1),
        (sketch_two, set2),
        (sketch_three, set3),
    )
    for sketch, members in pairs:
        for member in members:
            sketch.update(member.encode('utf8'))

    # Build the LSH index with the same permutation count as the sketches.
    index = MinHashLSH(threshold=0.5, num_perm=128)
    for key, sketch in (("m2", sketch_two), ("m3", sketch_three)):
        index.insert(key, sketch)

    neighbours = index.query(query_sketch)
    print("Approximate neighbours with Jaccard similarity > 0.5", neighbours)
def _run_minhash(A, B, data, seed, bs, num_perm):
    """Sketch two index ranges of ``data`` and estimate their Jaccard.

    Args:
        A: pair ``(start, end)`` selecting the first slice of ``data``.
        B: pair ``(start, end)`` selecting the second slice of ``data``.
        data: indexable collection of the items to hash.
        seed: seed forwarded to the murmur3 hasher on every call.
        bs: iterable of values passed one by one to
            ``_b_bit_minhash_jaccard``.
        num_perm: permutation count for both MinHash sketches.

    Returns:
        A list whose first entry is ``jaccard`` of the two sketches,
        followed by one ``_b_bit_minhash_jaccard`` result per entry
        of ``bs``, in order.
    """
    (lo_a, hi_a), (lo_b, hi_b) = A, B
    hash_fn = pyhash.murmur3_32()
    sketch_a = MinHash(num_perm=num_perm, hashobj=Hash)
    sketch_b = MinHash(num_perm=num_perm, hashobj=Hash)

    # Each sketch consumes the pre-hashed items of its own index range.
    for sketch, lo, hi in ((sketch_a, lo_a, hi_a), (sketch_b, lo_b, hi_b)):
        for idx in xrange(lo, hi):
            sketch.update(hash_fn(data[idx], seed=seed))

    estimates = [sketch_a.jaccard(sketch_b)]
    estimates.extend(
        _b_bit_minhash_jaccard(sketch_a, sketch_b, b) for b in bs
    )
    return estimates
def _run_minhash(A, B, data, seed, p):
    """Sketch two index ranges of ``data`` and compare them.

    Args:
        A: pair ``(start, end)`` selecting the first slice of ``data``.
        B: pair ``(start, end)`` selecting the second slice of ``data``.
        data: indexable collection of the items to hash.
        seed: seed forwarded to the murmur3 hasher on every call.
        p: exponent; both sketches use ``2**p`` permutations.

    Returns:
        The result of ``_minhash_inclusion`` applied to the sketch of
        range ``A`` and the sketch of range ``B``.
    """
    (lo_a, hi_a), (lo_b, hi_b) = A, B
    hash_fn = pyhash.murmur3_32()
    sketch_a = MinHash(num_perm=2**p, hashobj=Hash)
    sketch_b = MinHash(num_perm=2**p, hashobj=Hash)

    # Each sketch consumes the pre-hashed items of its own index range.
    for sketch, lo, hi in ((sketch_a, lo_a, hi_a), (sketch_b, lo_b, hi_b)):
        for idx in xrange(lo, hi):
            sketch.update(hash_fn(data[idx], seed=seed))

    return _minhash_inclusion(sketch_a, sketch_b)
def _run_minhash(A, B, data, seed, p):
    """Build MinHash sketches for two slices of ``data`` and compare them.

    Args:
        A: ``(start, end)`` index pair for the first slice.
        B: ``(start, end)`` index pair for the second slice.
        data: indexable collection holding the items to hash.
        seed: seed handed to the murmur3 hasher for each item.
        p: exponent; ``2**p`` permutations are used per sketch.

    Returns:
        Whatever ``_minhash_inclusion`` returns for the two sketches
        (first slice's sketch, then second slice's sketch).
    """
    (first_start, first_end), (second_start, second_end) = A, B
    murmur = pyhash.murmur3_32()
    first = MinHash(num_perm=2**p, hashobj=Hash)
    second = MinHash(num_perm=2**p, hashobj=Hash)

    # Fill the first sketch completely before touching the second.
    position = first_start
    for position in xrange(first_start, first_end):
        first.update(murmur(data[position], seed=seed))
    for position in xrange(second_start, second_end):
        second.update(murmur(data[position], seed=seed))

    return _minhash_inclusion(first, second)
def _run_acc(size, seed, num_perm):
    """Create a MinHash sketch and the matching exact set of values.

    The global ``random`` generator is re-seeded with ``seed``; then
    ``size`` integers are drawn from ``[1, size]``, converted with
    ``int_bytes`` and added to both the sketch and a plain ``set``.

    Args:
        size: number of draws, and also the upper bound of each draw.
        seed: value passed to ``random.seed``.
        num_perm: permutation count for the MinHash sketch.

    Returns:
        A tuple ``(sketch, values)`` of the MinHash and the set.
    """
    sketch = MinHash(num_perm=num_perm)
    values = set()
    random.seed(seed)

    # Lazy generator: each value is drawn right before it is consumed,
    # so the sequence of random draws is unchanged.
    encoded_draws = (
        int_bytes(random.randint(1, size)) for _ in range(size)
    )
    for encoded in encoded_draws:
        sketch.update(encoded)
        values.add(encoded)

    return sketch, values
def eg1():
    """Show how to find approximate neighbours through MinHashLSH.

    Sketches (128 permutations each) are built for ``set1``, ``set2``
    and ``set3``, which are defined outside this function. The last two
    are inserted into an LSH index as "m2" and "m3"; the first is used
    as the query, and the query result is printed.
    """
    query_sketch = MinHash(num_perm=128)
    sketch_two = MinHash(num_perm=128)
    sketch_three = MinHash(num_perm=128)

    # Populate each sketch from its source collection; members are
    # encoded to UTF-8 bytes before being hashed.
    sources = (
        (query_sketch, set1),
        (sketch_two, set2),
        (sketch_three, set3),
    )
    for sketch, members in sources:
        for member in members:
            sketch.update(member.encode('utf8'))

    # The index uses the same permutation count as the sketches.
    index = MinHashLSH(threshold=0.5, num_perm=128)
    for key, sketch in (("m2", sketch_two), ("m3", sketch_three)):
        index.insert(key, sketch)

    matches = index.query(query_sketch)
    print("Approximate neighbours with Jaccard similarity > 0.5", matches)
def eg1():
    """Compare a MinHash Jaccard estimate against the exact value.

    Uses ``data1`` and ``data2`` (defined outside this function): a
    default-configured MinHash sketch is built for each, the estimated
    Jaccard similarity is printed, and then the exact Jaccard similarity
    of the two collections (as sets) is computed and printed.
    """
    sketch_a, sketch_b = MinHash(), MinHash()

    # Items are encoded to UTF-8 bytes before being fed to the sketch.
    for sketch, items in ((sketch_a, data1), (sketch_b, data2)):
        for item in items:
            sketch.update(item.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is",
          sketch_a.jaccard(sketch_b))

    # Exact Jaccard: |intersection| / |union| of the distinct items.
    unique_a, unique_b = set(data1), set(data2)
    shared_count = len(unique_a & unique_b)
    union_count = len(unique_a | unique_b)
    actual_jaccard = float(shared_count) / float(union_count)
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def _run_minhash(data, seed, p):
    """Return the MinHash ``count()`` result for the items in ``data``.

    Args:
        data: iterable of items to hash.
        seed: seed forwarded to the murmur3 hasher for every item.
        p: exponent; the sketch uses ``2**p`` permutations.

    Returns:
        The value of ``count()`` on the filled sketch.
    """
    hash_fn = pyhash.murmur3_32()
    sketch = MinHash(num_perm=2**p, hashobj=Hash)
    # Items are pre-hashed with murmur3 before being added to the sketch.
    for item in data:
        hashed = hash_fn(item, seed=seed)
        sketch.update(hashed)
    return sketch.count()