def test_uncovered_by(self):
before = {1: {1, 2, 3}, 2: {1, 2}, 3: {3}}
after = {1: {1, 2}, 2: {1, 2}}
before_copy = before.copy()
assert training.BranchBound.uncovered_by(before, set()) == before
assert training.BranchBound.uncovered_by(before, {3}) == after
assert before == before_copy
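
# A rough sketch (not the library's implementation) of the behaviour exercised
# above: uncovered_by removes the already-covered example ids from every
# predicate's cover set and drops predicates with nothing left to cover,
# without mutating its input.
def uncovered_by_sketch(coverage, covered):
    remaining = {}
    for predicate, examples in coverage.items():
        still_uncovered = examples - covered
        if still_uncovered:
            remaining[predicate] = still_uncovered
    return remaining

# e.g. uncovered_by_sketch({1: {1, 2, 3}, 2: {1, 2}, 3: {3}}, {3})
#      == {1: {1, 2}, 2: {1, 2}}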
def test_dedupe_coverage(self):
predicates = self.data_model.predicates()
blocker = dedupe.blocking.Blocker(predicates)
dedupe.training.prepare_index(blocker, self.training_pairs, "Dedupe")
coverage = dedupe.training.coveredBy(blocker.predicates, self.training)
assert self.simple(coverage.keys()).issuperset(
set(["SimplePredicate: (tokenFieldPredicate, name)",
"SimplePredicate: (commonSixGram, name)",
"TfidfTextCanopyPredicate: (0.4, name)",
"SimplePredicate: (sortedAcronym, name)",
"SimplePredicate: (sameThreeCharStartPredicate, name)",
"TfidfTextCanopyPredicate: (0.2, name)",
"SimplePredicate: (sameFiveCharStartPredicate, name)",
"TfidfTextCanopyPredicate: (0.6, name)",
"SimplePredicate: (wholeFieldPredicate, name)",
"TfidfTextCanopyPredicate: (0.8, name)",
"SimplePredicate: (commonFourGram, name)",
"SimplePredicate: (firstTokenPredicate, name)",
"SimplePredicate: (sameSevenCharStartPredicate, name)"]))
def __init__(self,
             data_model,
             candidates,
             data_1,
             data_2,
             original_length_1,
             original_length_2,
             index_include):
super().__init__(data_model, candidates)
sampled_records_1 = Sample(data_1, 600, original_length_1)
index_data = Sample(data_2, 50000, original_length_2)
sampled_records_2 = Sample(index_data, 600, original_length_2)
preds = self.data_model.predicates(canopies=False)
self.block_learner = training.RecordLinkBlockLearner(preds,
sampled_records_1,
sampled_records_2,
index_data)
examples_to_index = candidates.copy()
if index_include:
    examples_to_index += index_include
self._index_predicates(examples_to_index)
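
# The Sample(...) calls above cap how many records feed the block learner.
# A minimal sketch of that idea, assuming Sample essentially draws a random
# subset of a {record_id: record} dict; the helper name below is hypothetical,
# not the library's class.
import random

def sample_records_sketch(data, sample_size):
    if len(data) <= sample_size:
        return dict(data)
    sampled_keys = random.sample(list(data), sample_size)
    return {key: data[key] for key in sampled_keys}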
'state': {'type': 'String'},
'zip': {'type': 'String'},
}
deduper = dedupe.Dedupe(fields)
# Sometimes we will want to add additional labeled examples to a
# training file. To do this, we can just load the existing labeled
# pairs...
if os.path.exists(training_file):
print 'reading labeled examples from ', training_file
deduper.train(data_samples, training_file)
print 'starting active labeling...'
print 'finding uncertain pairs...'
# ... and then call training with our interactive function
deduper.train(data_samples, dedupe.training.consoleLabel)
deduper.writeTraining(training_file)
print 'blocking...'
t_block = time.time()
blocker = deduper.blockingFunction(eta=0.001, epsilon=5)
deduper.writeSettings(settings_file)
print 'blocked in', time.time() - t_block, 'seconds'
# So the learning is done and we have our blocker. However we cannot
# block the data in memory. We have to pass through all the data and
# create a blocking map table.
#
# First though, if we learned a tf-idf predicate, we have to create the
# tf-idf blocks for the full data set.
print 'creating inverted index'
full_data = ((row['donor_id'], row) for row in con.execute(donor_select))
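
# A sketch of the blocking-map step described in the comments above, assuming
# the old dedupe Blocker API in which calling the blocker on an iterable of
# (record_id, record) pairs yields (block_key, record_id) tuples. The table
# name and SQL are illustrative only; full_data is re-created here because a
# generator can only be consumed once.
con.execute("DROP TABLE IF EXISTS blocking_map")
con.execute("CREATE TABLE blocking_map (block_key VARCHAR(200), donor_id INTEGER)")
full_data = ((row['donor_id'], row) for row in con.execute(donor_select))
con.executemany("INSERT INTO blocking_map VALUES (?, ?)", blocker(full_data))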
a record dictionary.
In the record dictionary, the keys are the names of the
record fields and the values are the record values.
"""
self.data_model = datamodel.DataModel(variable_definition)
if num_cores is None:
self.num_cores = multiprocessing.cpu_count()
else:
self.num_cores = num_cores
if data_sample:
self._checkDataSample(data_sample)
self.data_sample = data_sample
self.activeLearner = training.ActiveLearning(self.data_sample,
self.data_model,
self.num_cores)
else:
self.data_sample = []
self.activeLearner = None
# Override _loadSampledRecords() to load blocking data from
# data_sample.
self._loadSampledRecords(data_sample)
training_dtype = [('label', 'S8'),
('distances', 'f4',
(len(self.data_model), ))]
self.training_data = numpy.zeros(0, dtype=training_dtype)
self.training_pairs = OrderedDict({u'distinct': [],
                                   u'match': []})
# 1: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more match pairs)]
# }
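# For illustration, one match-pair entry of the shape sketched above might be:
#   1: [[{'name': 'A Corp', 'city': 'Chicago'},
#        {'name': 'A Corp.', 'city': 'Chicago'}]]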
if os.path.exists(training_file):
print 'reading labeled examples from ', training_file
deduper.train(data_sample, training_file)
# ## Active learning
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
deduper.train(data_sample, dedupe.training.consoleLabel)
# When finished, save our training away to disk
deduper.writeTraining(training_file)
# Blocking
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong': (dedupe.predicates.latLongGridPredicate,)})
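
# A hedged sketch of what one of these blocking predicates looks like: a plain
# function that maps a field value to the block key(s) it generates. The
# function below is hypothetical and only illustrates the calling convention;
# it is not part of dedupe.
def roundedLatLongPredicate(field):
    latitude, longitude = field
    return (str((round(latitude, 1), round(longitude, 1))),)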
time_start = time.time()
print 'blocking...'
# Initialize the blocker
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(ppc,
dupes,
# 1: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more match pairs)]
# }
if os.path.exists(r_training_file):
print 'reading labeled examples from ', r_training_file
deduper.train(data_sample, r_training_file)
# ## Active learning
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
deduper.train(data_sample, dedupe.training.consoleLabel)
# When finished, save our training away to disk
deduper.writeTraining(r_training_file)
# ## Blocking
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong': (dedupe.predicates.latLongGridPredicate,)})
time_start = time.time()
print 'blocking...'
# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data
#blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,