from collections import Counter
from datasketch import MinHashLSH


def print_stats(
        f, show=None, skip_unique=False, max_int_value=5, duration_limit=None,
        print_duplicates=False, print_urls=False, limit=None):
    # item_reader and get_too_common_shingles are project-local helpers
    # that are not part of this snippet.
    stats = Counter()
    if not skip_unique:
        lsh = MinHashLSH(threshold=0.9, num_perm=128)
        too_common = get_too_common_shingles(f, limit=1000)
    urls = {}
    min_timestamp = max_timestamp = None
    for i, item in enumerate(item_reader(f, limit=limit)):
        if print_urls:
            print(item['url'])
        content_type = item.get('content_type', 'missing')
        stats.update([
            'content_type: ' + content_type,
            'content_type[0]: ' + content_type.split('/')[0]])
        if min_timestamp is None:
            min_timestamp = item['timestamp']
        max_timestamp = item['timestamp']
        if duration_limit and \
                (max_timestamp - min_timestamp) / 1000 > duration_limit:
            break
altnid_sid_dict = {tmp[0]: tmp[1] for tmp in altn}
altnid_name_dict = {tmp[0]: tmp[2] for tmp in altn}
sid_sname_dict = {tmp[1]: tmp[3] for tmp in altn}
sid_altnid_dict = {}
for nid, sid in altnid_sid_dict.items():
    sid_altnid_dict.setdefault(sid, []).append(nid)
print("Have %s altnames for %s series" % (len(altnid_sid_dict), len(sid_altnid_dict)))

perms = 512
gram_sz = 3
minhashes = {}
lsh = MinHashLSH(threshold=SIMILARITY_RATIO, num_perm=perms)
print("Building lsh minhash data structure")
with ProcessPoolExecutor(max_workers=8) as ex:
    print("Submitting jobs")
    futures = [(key, ex.submit(minhash_str, content, perms, gram_sz))
               for key, content in altnid_name_dict.items()
               if len(content) >= 5]
    print("Consuming futures")
    for key, future in tqdm.tqdm(futures):
        minhash = future.result()
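The minhash_str worker submitted to the process pool is not shown in this snippet; its name and the (content, perms, gram_sz) arguments come from the ex.submit call above, but its body is an assumption. A plausible sketch, assuming it builds a MinHash over character n-gram shingles of length gram_sz:

from datasketch import MinHash

def minhash_str(content, num_perm, gram_sz):
    # Hypothetical worker: one MinHash per string, fed with its
    # character n-grams of length gram_sz.
    m = MinHash(num_perm=num_perm)
    for i in range(max(len(content) - gram_sz + 1, 1)):
        m.update(content[i:i + gram_sz].encode('utf8'))
    return m

MinHash objects are picklable, so returning them from a ProcessPoolExecutor worker as above works.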
# Create minhashes, one per record, from the 2-gram shingles of its fields
minhashes = {}
for rid in records:
    m = MinHash(num_perm=self._num_perm)
    for d in records[rid]:
        qgrams = set(self.nt.basic(d, 2))
        for gram in qgrams:
            m.update(gram.encode('utf-8'))
    minhashes[rid] = m

# Create LSH instance and add min hashes
if self._bands == MinHashLSHRecordDeduplication.BANDS and self._rows == MinHashLSHRecordDeduplication.ROWS:
    lsh = MinHashLSH(threshold=self._threshold, num_perm=self._num_perm)
else:
    lsh = MinHashLSH(num_perm=self._num_perm, params=(self._bands, self._rows))

max_blocks = []
for rid in records:
    lsh.insert(rid, minhashes[rid])
    max_blocks.append(rid)

# Generate blocks: repeatedly take a seed record, query its LSH bucket,
# and put every bucket member into the same block
while len(max_blocks) > 0:
    key = max_blocks[0]
    bucket = lsh.query(minhashes[key])
    for rid in bucket:
        if rid in max_blocks:
            max_blocks.remove(rid)
        indexer["b" + str(self._block_index)].append(rid)
    self._block_index += 1
def analyze_file(name, f, verbose=False):
    urls = []
    Doc = namedtuple('Doc', ['item', 'min_hash'])
    documents = {}  # key -> Doc
    lsh = MinHashLSH(threshold=0.9, num_perm=128)
    too_common = get_too_common_shingles(f, name, limit=300)
    for i, item in enumerate(item_reader(f, name)):
        urls.append(item['url'])
        min_hash = get_min_hash(item['extracted_text'], too_common)
        key = 'item_{}'.format(i)
        item = {'url': item['url']}
        documents[key] = Doc(item, min_hash)
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
    paths = [''.join([p.netloc, p.path]) for p in map(urlsplit, urls)]
    duplicates = get_duplicates(lsh, documents, verbose=verbose)
    print(name.ljust(40), '\t'.join(map(str, [
        len(urls), len(set(urls)), len(set(paths)),
        n_unique(documents, duplicates),
    ])))
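get_min_hash and get_too_common_shingles are project-local helpers used by analyze_file above (and by learn_duplicates further down) but not shown here. A rough sketch of what get_min_hash could look like, assuming 4-word shingles and a 128-permutation MinHash (both the shingle scheme and the defaults are assumptions):

from datasketch import MinHash

def get_min_hash(text, too_common, num_perm=128):
    # Hypothetical reconstruction: hash every shingle that is not in
    # the "too common" exclusion set into a MinHash signature.
    m = MinHash(num_perm=num_perm)
    words = text.split()
    for i in range(max(len(words) - 3, 0)):
        shingle = ' '.join(words[i:i + 4])
        if shingle not in too_common:
            m.update(shingle.encode('utf8'))
    return m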
m3 = MinHash(num_perm=128)
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))
print(m1.hashvalues)
print(m2.hashvalues)
print(m3.hashvalues)

import numpy as np
print(np.shape(m1.hashvalues))

# Create a MinHashLSH index optimized for Jaccard threshold 0.5,
# accepting MinHash objects built with 128 permutation functions
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Insert m2 and m3 into the index
lsh.insert("m2", m2)
lsh.insert("m3", m3)

# Check for membership using the key
print("m2" in lsh)
print("m3" in lsh)

# Using m1 as the query, retrieve the keys of the qualifying datasets
result = lsh.query(m1)
print("Candidates with Jaccard similarity > 0.5", result)

# Remove a key from the index
lsh.remove("m2")
from datasketch import MinHash, MinHashLSH


def construct_lsh(obj_dict):
    lsh_0 = MinHashLSH(threshold=0, num_perm=128, params=None)
    lsh_5 = MinHashLSH(threshold=0.6, num_perm=128, params=None)
    # forest = MinHashLSHForest(num_perm=128)
    keys = list(obj_dict.keys())      # list() so the views are indexable below
    values = list(obj_dict.values())
    ms = []
    for i in range(len(keys)):
        temp = MinHash(num_perm=128)
        for d in values[i]:
            temp.update(d.encode('utf8'))
        ms.append(temp)
        lsh_0.insert(keys[i], temp)
        lsh_5.insert(keys[i], temp)
    return lsh_0, lsh_5, keys, ms
import time

from datasketch import MinHashLSH


def benchmark_lsh(num_perm, threshold, index_data, query_data):
    print("Building LSH index")
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(sorted(
            [[key, _compute_jaccard(qs, index_data.sets[key])] for key in result],
            key=lambda x: x[1], reverse=True))
    return times, results
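How index_data and query_data are built is not shown; judging only from the attribute accesses above, each behaves like a record with keys, sets, and a per-num_perm list of MinHashes, and _compute_jaccard is defined elsewhere. A hedged sketch of driving benchmark_lsh with such inputs (the Data namedtuple, the toy sets, and this _compute_jaccard are all illustrative assumptions, not the original harness):

from collections import namedtuple
from datasketch import MinHash

# Hypothetical container matching the attribute accesses in benchmark_lsh.
Data = namedtuple('Data', ['keys', 'sets', 'minhashes'])

def _compute_jaccard(a, b):
    # Exact Jaccard on the raw sets, used to rank the LSH candidates.
    return len(a & b) / len(a | b)

def _minhash(s, num_perm):
    m = MinHash(num_perm=num_perm)
    for token in s:
        m.update(token.encode('utf8'))
    return m

num_perm = 128
raw = {'a': {'x', 'y', 'z'}, 'b': {'x', 'y', 'w'}}
index_data = Data(keys=list(raw), sets=raw,
                  minhashes={num_perm: [_minhash(raw[k], num_perm) for k in raw]})
query_data = Data(keys=index_data.keys, sets=list(raw.values()),
                  minhashes=index_data.minhashes)
times, results = benchmark_lsh(num_perm, 0.5, index_data, query_data)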
    for each page.
:param storage_config: configuration for a redis backend to persist
    minhashes in. Using this backend makes DupePredictor instances
    persistent across restarts. The configuration format is:
    storage_config={'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}}.
    See https://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
"""
self.jaccard_threshold = jaccard_threshold
self.num_perm = num_perm
self.storage_config = storage_config
if storage_config:
    self.lsh = MinHashLSH(
        threshold=self.jaccard_threshold, num_perm=self.num_perm,
        storage_config=self.storage_config)
else:
    self.lsh = MinHashLSH(
        threshold=self.jaccard_threshold, num_perm=self.num_perm)
self.too_common_shingles = set()
if texts_sample:
    self.too_common_shingles = get_too_common_shingles(texts_sample)
self.seen_urls = {}  # url: URLMeta
self.urls_by_path = defaultdict(set)  # path: {url}
self.urls_by_path_q = defaultdict(set)  # (path, q): {url}
self.urls_by_path_qwp = defaultdict(set)  # (path, param, q): {url}
self.params_by_path = defaultdict(set)  # path: {param}
self.param_values = defaultdict(set)  # (path, param): {value}
# Duplicate hypotheses:
# (1) All items with same path are duplicates. Key is (path,)
self.path_dupstats = defaultdict(DupStat)
# (2) All items with same path that differ only in given param are
def learn_duplicates(name, f, verbose=False):
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)
    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []

    def _report_pr():
        tp = sum(p > threshold and d for p, d in zip(y_pred, y_true))
        fp = sum(p > threshold and not d for p, d in zip(y_pred, y_true))
        fn = sum(p < threshold and d for p, d in zip(y_pred, y_true))
        n_dup = tp + fn
        print('precision: %.3f, recall %.3f at %.2f threshold '
              '(%d duplicates)' % (
                  tp / (tp + fp) if tp else 0.,
                  tp / n_dup if n_dup else 0., threshold, n_dup))

    for i, item in enumerate(item_reader(f, name)):
        dupe_prob = dupe_predictor.get_dupe_prob(item['url'])
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)