Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _run_hyperloglog(A, B, data, seed, p):
    """Sketch two index ranges of ``data`` and compare them with
    ``_hyperloglog_jaccard``.

    Args:
        A: ``(start, end)`` pair; ``data[start]`` .. ``data[end - 1]`` are
            fed into the first sketch.
        B: ``(start, end)`` pair selecting the items for the second sketch.
        data: Indexable collection of values accepted by
            ``pyhash.murmur3_32``.
        seed: Seed forwarded to every hash call.
        p: Precision parameter forwarded to both ``HyperLogLog`` sketches.

    Returns:
        The result of ``_hyperloglog_jaccard`` applied to the two sketches.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p, hashobj=Hash)
    h2 = HyperLogLog(p=p, hashobj=Hash)
    # range() rather than xrange(): xrange does not exist on Python 3, and
    # range() walks exactly the same indices on both major versions.
    for sketch, start, end in ((h1, a_start, a_end), (h2, b_start, b_end)):
        for i in range(start, end):
            sketch.update(hasher(data[i], seed=seed))
    return _hyperloglog_jaccard(h1, h2)
def _run_hyperloglog(A, B, data, seed, p):
    """Sketch two index ranges of ``data`` and compare them with
    ``_hyperloglog_inclusion``.

    Args:
        A: ``(start, end)`` pair; ``data[start]`` .. ``data[end - 1]`` are
            fed into the first sketch.
        B: ``(start, end)`` pair selecting the items for the second sketch.
        data: Indexable collection of values accepted by
            ``pyhash.murmur3_32``.
        seed: Seed forwarded to every hash call.
        p: Precision parameter forwarded to both ``HyperLogLog`` sketches.

    Returns:
        The result of ``_hyperloglog_inclusion`` applied to the two sketches.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p, hashobj=Hash)
    h2 = HyperLogLog(p=p, hashobj=Hash)
    # range() rather than xrange(): xrange does not exist on Python 3, and
    # range() walks exactly the same indices on both major versions.
    for sketch, start, end in ((h1, a_start, a_end), (h2, b_start, b_end)):
        for i in range(start, end):
            sketch.update(hasher(data[i], seed=seed))
    return _hyperloglog_inclusion(h1, h2)
def _run_hyperloglog(A, B, data, seed, p):
    """Sketch two index ranges of ``data`` and compare them with
    ``_hyperloglog_jaccard``.

    Args:
        A: ``(start, end)`` pair; ``data[start]`` .. ``data[end - 1]`` are
            fed into the first sketch.
        B: ``(start, end)`` pair selecting the items for the second sketch.
        data: Indexable collection of values accepted by
            ``pyhash.murmur3_32``.
        seed: Seed forwarded to every hash call.
        p: Precision parameter forwarded to both ``HyperLogLog`` sketches.

    Returns:
        The result of ``_hyperloglog_jaccard`` applied to the two sketches.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p, hashobj=Hash)
    h2 = HyperLogLog(p=p, hashobj=Hash)
    # range() rather than xrange(): xrange does not exist on Python 3, and
    # range() walks exactly the same indices on both major versions.
    for sketch, start, end in ((h1, a_start, a_end), (h2, b_start, b_end)):
        for i in range(start, end):
            sketch.update(hasher(data[i], seed=seed))
    return _hyperloglog_jaccard(h1, h2)
def eg1():
    """Print the HyperLogLog cardinality estimate for ``data1`` next to
    the exact distinct count obtained from a plain ``set``.
    """
    sketch = HyperLogLog()
    for item in data1:
        # The sketch is fed the UTF-8 encoded form of each item.
        encoded = item.encode('utf8')
        sketch.update(encoded)
    estimate = sketch.count()
    print("Estimated cardinality is", estimate)
    # Exact answer for comparison: number of distinct items in data1.
    distinct = set(data1)
    print("Actual cardinality is", len(distinct))
def run_perf(card, p):
    """Time how long a HyperLogLog sketch takes to absorb ``card`` updates.

    Args:
        card: Number of updates to perform; the values fed to the sketch
            are ``int_bytes(i)`` for each ``i`` in ``range(card)``.
        p: Precision parameter passed to ``HyperLogLog``.

    Returns:
        Elapsed seconds spent in the update loop only (sketch construction
        and logging are excluded).
    """
    # time.clock() was removed in Python 3.8. Prefer perf_counter() and only
    # fall back to clock() on interpreters old enough to lack it.
    timer = getattr(time, "perf_counter", None) or time.clock
    h = HyperLogLog(p=p)
    logging.info("HyperLogLog using p = %d " % p)
    start = timer()
    for i in range(card):
        h.update(int_bytes(i))
    duration = timer() - start
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration
def _run_hyperloglog(A, B, data, seed, p):
    """Sketch two index ranges of ``data`` and compare them with
    ``_hyperloglog_inclusion``.

    Args:
        A: ``(start, end)`` pair; ``data[start]`` .. ``data[end - 1]`` are
            fed into the first sketch.
        B: ``(start, end)`` pair selecting the items for the second sketch.
        data: Indexable collection of values accepted by
            ``pyhash.murmur3_32``.
        seed: Seed forwarded to every hash call.
        p: Precision parameter forwarded to both ``HyperLogLog`` sketches.

    Returns:
        The result of ``_hyperloglog_inclusion`` applied to the two sketches.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p, hashobj=Hash)
    h2 = HyperLogLog(p=p, hashobj=Hash)
    # range() rather than xrange(): xrange does not exist on Python 3, and
    # range() walks exactly the same indices on both major versions.
    for sketch, start, end in ((h1, a_start, a_end), (h2, b_start, b_end)):
        for i in range(start, end):
            sketch.update(hasher(data[i], seed=seed))
    return _hyperloglog_inclusion(h1, h2)
size = struct.calcsize('B')
try:
p = struct.unpack_from('B', buf, 0)[0]
except TypeError:
p = struct.unpack_from('B', buffer(buf), 0)[0]
self.__init__(p=p)
offset = size
try:
self.reg = np.array(struct.unpack_from('%dB' % self.m,
buf, offset), dtype=np.int8)
except TypeError:
self.reg = np.array(struct.unpack_from('%dB' % self.m,
buffer(buf), offset), dtype=np.int8)
class HyperLogLogPlusPlus(HyperLogLog):
'''
HyperLogLog++ is an enhanced HyperLogLog `from Google
`_.
Main changes from the original HyperLogLog:
1. Use 64 bits instead of 32 bits for hash function
2. A new small-cardinality estimation scheme
3. Sparse representation (not implemented here)
Args:
p (int, optional): The precision parameter. It is ignored if
the `reg` is given.
reg (numpy.array, optional): The internal state.
This argument is for initializing the HyperLogLog from
an existing one.
hashfunc (optional): The hash function used by this MinHash.
def run_acc(size, seed, p):
    """Measure the relative counting error of a HyperLogLog sketch.

    Seeds the module-level ``random`` generator with ``seed``, draws
    ``size`` integers from ``random.randint(1, size)``, and feeds the
    ``int_bytes`` form of each draw to both a ``HyperLogLog(p=p)`` sketch
    and an exact ``set``.

    Args:
        size: Number of draws, and also the upper bound of each draw.
        seed: Seed for the ``random`` module.
        p: Precision parameter passed to ``HyperLogLog``.

    Returns:
        ``abs(exact - estimate) / exact``, where ``exact`` is the size of
        the set and ``estimate`` is the sketch's ``count()``.
    """
    logging.info("HyperLogLog using p = %d " % p)
    sketch = HyperLogLog(p=p)
    seen = set()
    random.seed(seed)
    for _ in range(size):
        value = int_bytes(random.randint(1, size))
        sketch.update(value)
        seen.add(value)
    exact = float(len(seen))
    deviation = abs(exact - sketch.count())
    return deviation / exact