from collections import defaultdict
import itertools

import dedupe
import dedupe.blocking
import dedupe.predicates
from future.utils import viewvalues, viewitems


def test_unconstrained_inverted_index(self):
    # A 0.0 threshold means any records whose names share a token are
    # candidates, so the two "Jimbo"s and the two "Willy"s block together.
    blocker = dedupe.blocking.Blocker(
        [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")])

    blocker.index(set(record["name"]
                      for record in viewvalues(self.data_d)),
                  "name")

    blocks = defaultdict(set)
    for block_key, record_id in blocker(self.data_d.items()):
        blocks[block_key].add(record_id)

    blocks = {frozenset(block) for block in blocks.values()
              if len(block) > 1}

    assert blocks == {frozenset([120, 125]), frozenset([130, 135])}
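# Why 120/125 and 130/135 end up blocked together: with a 0.0 threshold,
# any two records whose names share a token fall into the same canopy.
# A minimal hand-rolled sketch of that grouping (illustrative only, not
# dedupe's actual implementation):
def toy_token_blocks(records):
    blocks = defaultdict(set)
    for record_id, record in records.items():
        for token in record["name"].lower().split():
            blocks[token].add(record_id)
    return {key: ids for key, ids in blocks.items() if len(ids) > 1}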
# Earlier candidate-construction variants, kept commented for reference:
# candidates = [(source[k1], ref[k2])
#               for k1, k2
#               in blocked_sample_keys | random_sample_keys]
# candidates = [(source_items[k1], ref_items[k2])
#               for predicate, (k1, k2)
#               in blocked_sample_keys]

compound_length = 1
blocker = blocking.Blocker(my_predicates)

# blocker.indexAll({i: record
#                   for i, record
#                   in enumerate(unroll(candidates))})
#
# dupe_cover = cover(blocker, candidates, compound_length)
#
# tab = pd.DataFrame([[key, val] for key, val in dupe_cover_count.items()],
#                    columns=['predicate', 'count'])
# tab['col'] = tab.predicate.apply(get_col)
# tab.groupby('col')['count'].mean()  # meaningful columns
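# The commented analysis above uses a `dupe_cover_count` that is never
# defined in this fragment. Assuming `dupe_cover` maps each predicate to
# the set of candidate pairs it covers, the counts could be derived with a
# helper like this (hypothetical, not part of dedupe):
def count_dupe_cover(dupe_cover):
    return {predicate: len(pairs) for predicate, pairs in dupe_cover.items()}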
def invert_dupe_cover(dupe_cover):
    inv_dupe_cover = defaultdict(set)
    for key, matches in dupe_cover.items():
        for match in matches:
            inv_dupe_cover[match].add(key)
    return inv_dupe_cover
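# Quick sanity check of the inversion with toy values:
dupe_cover = {"pred_a": {(1, 2), (3, 4)}, "pred_b": {(1, 2)}}
inv = invert_dupe_cover(dupe_cover)
assert inv[(1, 2)] == {"pred_a", "pred_b"}
assert inv[(3, 4)] == {"pred_a"}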
def test_dedupe_coverage(self):
    predicates = self.data_model.predicates()
    blocker = dedupe.blocking.Blocker(predicates)
    dedupe.training.prepare_index(blocker, self.training_pairs, "Dedupe")
    coverage = dedupe.training.coveredBy(blocker.predicates, self.training)
    assert self.simple(coverage.keys()).issuperset(
        set(["SimplePredicate: (tokenFieldPredicate, name)",
             "SimplePredicate: (commonSixGram, name)",
             "TfidfTextCanopyPredicate: (0.4, name)",
             "SimplePredicate: (sortedAcronym, name)",
             "SimplePredicate: (sameThreeCharStartPredicate, name)",
             "TfidfTextCanopyPredicate: (0.2, name)",
             "SimplePredicate: (sameFiveCharStartPredicate, name)",
             "TfidfTextCanopyPredicate: (0.6, name)",
             "SimplePredicate: (wholeFieldPredicate, name)",
             "TfidfTextCanopyPredicate: (0.8, name)",
             "SimplePredicate: (commonFourGram, name)",
             "SimplePredicate: (firstTokenPredicate, name)"]))
def setUp(self):
    data_d = {
        100: {"name": "Bob", "age": "50", "dataset": 0},
        105: {"name": "Charlie", "age": "75", "dataset": 1},
        110: {"name": "Meredith", "age": "40", "dataset": 1},
        115: {"name": "Sue", "age": "10", "dataset": 0},
        120: {"name": "Jimbo", "age": "21", "dataset": 0},
        125: {"name": "Jimbo", "age": "21", "dataset": 0},
        130: {"name": "Willy", "age": "35", "dataset": 0},
        135: {"name": "Willy", "age": "35", "dataset": 1},
        140: {"name": "Martha", "age": "19", "dataset": 1},
        145: {"name": "Kyle", "age": "27", "dataset": 0},
    }
    self.blocker = dedupe.blocking.Blocker(
        [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")])
    # Split the fixture into the two sides of a record-linkage problem.
    self.records_1 = {record_id: record
                      for record_id, record in viewitems(data_d)
                      if record["dataset"] == 0}
    self.fields_2 = {record_id: record["name"]
                     for record_id, record in viewitems(data_d)
                     if record["dataset"] == 1}
def test_dedupe_coverage(self):
    predicates = self.data_model.predicates()
    blocker = dedupe.blocking.Blocker(predicates)
    blocker.indexAll({i: x for i, x in enumerate(self.training_records)})
    coverage = training.Cover(blocker.predicates,
                              self.training)
    assert self.simple(coverage.keys()).issuperset(
        set(["SimplePredicate: (tokenFieldPredicate, name)",
             "SimplePredicate: (commonSixGram, name)",
             "TfidfTextCanopyPredicate: (0.4, name)",
             "SimplePredicate: (sortedAcronym, name)",
             "SimplePredicate: (sameThreeCharStartPredicate, name)",
             "TfidfTextCanopyPredicate: (0.2, name)",
             "SimplePredicate: (sameFiveCharStartPredicate, name)",
             "TfidfTextCanopyPredicate: (0.6, name)",
             "SimplePredicate: (wholeFieldPredicate, name)",
             "TfidfTextCanopyPredicate: (0.8, name)",
             "SimplePredicate: (commonFourGram, name)",
             "SimplePredicate: (firstTokenPredicate, name)"]))
def _trainBlocker(self, maximum_comparisons, recall, index_predicates): # pragma: no cover
matches = self.training_pairs['match'][:]
predicate_set = self.data_model.predicates(index_predicates,
self.canopies)
block_learner = self._blockLearner(predicate_set)
self.predicates = block_learner.learn(matches,
maximum_comparisons,
recall)
self.blocker = blocking.Blocker(self.predicates)
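# As called above, block_learner.learn searches for a predicate set that
# still covers the `matches` at the requested `recall` while keeping the
# estimated comparison count under `maximum_comparisons`.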
def __init__(self, predicates, sampled_records, data):
compound_length = 2
N = sampled_records.original_length
N_s = len(sampled_records)
self.r = (N * (N - 1)) / (N_s * (N_s - 1))
self.blocker = blocking.Blocker(predicates)
self.blocker.indexAll(data)
simple_cover = self.coveredPairs(self.blocker, sampled_records)
compound_predicates = self.compound(simple_cover, compound_length)
self.comparison_count = self.comparisons(compound_predicates,
simple_cover)
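# The ratio r above scales pair counts observed in the sample up to the
# full data: N records form N*(N-1) ordered pairs against the sample's
# N_s*(N_s - 1), so multiplying a sampled covered-pair count by r estimates
# the full-data comparison load. A rough illustration with made-up sizes:
N, N_s = 10000, 100
r = (N * (N - 1)) / (N_s * (N_s - 1))  # 10100.0
estimated_comparisons = r * 40         # ~404,000 comparisons estimated from 40 sampled pairs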
def canopyOverlap(self,
                  tfidf_predicates,
                  record_pairs):
    # Uniquify the records and give each one a small integer id.
    docs = list(set(itertools.chain(*record_pairs)))
    id_records = list(enumerate(docs))
    record_ids = {doc: i for i, doc in enumerate(docs)}

    blocker = Blocker()
    blocker.tfidf_predicates = tfidf_predicates
    blocker.tfIdfBlocks(id_records)

    # A pair overlaps under a predicate when both records fall into the
    # same canopy for that threshold/field combination.
    for threshold, field in blocker.tfidf_predicates:
        canopy = blocker.canopies[threshold.__name__ + field]
        for record_1, record_2 in record_pairs:
            id_1 = record_ids[record_1]
            id_2 = record_ids[record_2]
            if canopy[id_1] == canopy[id_2]:
                self.overlapping[(threshold, field)].add((record_1, record_2))
                self.blocks[(threshold, field)][canopy[id_1]].add((record_1, record_2))