Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_markPair(self):
    """Exercise ``ActiveMatching.markPairs`` with bad, good and empty input.

    The test expects:
    - a ``ValueError`` when the training dict uses the key 'non_dupes',
    - labels stored in insertion order for a valid 'distinct'/'match' dict,
    - exactly one warning when both label lists are empty.
    """
    from collections import OrderedDict

    # OrderedDict keeps 'distinct' ahead of 'match', so the label order
    # asserted below is deterministic.
    valid_pairs = OrderedDict([
        ('distinct', DATA_SAMPLE[0:3]),
        ('match', DATA_SAMPLE[3:5]),
    ])
    invalid_pairs = {
        'non_dupes': DATA_SAMPLE[0:3],
        'match': DATA_SAMPLE[3:5],
    }

    matcher = dedupe.api.ActiveMatching(self.field_definition)

    with self.assertRaises(ValueError):
        matcher.markPairs(invalid_pairs)

    matcher.markPairs(valid_pairs)
    expected_labels = [b'distinct'] * 3 + [b'match'] * 2
    numpy.testing.assert_equal(matcher.training_data['label'],
                               expected_labels)

    with warnings.catch_warnings(record=True) as caught:
        # "always" prevents the warning from being swallowed by the
        # once-per-location default filter.
        warnings.simplefilter("always")
        matcher.markPairs({'match': [], 'distinct': []})
        assert len(caught) == 1
        assert str(caught[-1].message) == "Didn't return any labeled record pairs"
def test_initialize_fields(self):
    """Check ``ActiveMatching`` construction with no and with empty fields.

    Calling the constructor without arguments is expected to raise
    ``TypeError``; passing an empty definition is expected to leave
    ``blocker`` as ``None``.
    """
    with self.assertRaises(TypeError):
        dedupe.api.ActiveMatching()

    empty_matcher = dedupe.api.ActiveMatching({})
    assert empty_matcher.blocker is None
def test_add_training(self):
    """Check that ``_addTrainingData`` appends labels on every call.

    The same labelled pairs are added twice; after the first call the
    stored labels must equal one batch, after the second call two
    batches back to back.
    """
    from collections import OrderedDict

    labelled_pairs = OrderedDict([
        ('distinct', DATA_SAMPLE[0:3]),
        ('match', DATA_SAMPLE[3:5]),
    ])
    one_batch = [b'distinct', b'distinct', b'distinct',
                 b'match', b'match']

    matcher = dedupe.api.ActiveMatching(self.field_definition)

    for batches_added in (1, 2):
        matcher._addTrainingData(labelled_pairs)
        numpy.testing.assert_equal(matcher.training_data['label'],
                                   one_batch * batches_added)
def test_check_record(self):
    """Check ``_checkRecordPairType`` against malformed and valid pairs.

    Each malformed input below is expected to raise ``ValueError``; a
    pair of dictionaries with 'name' and 'age' keys must pass silently.
    """
    matcher = dedupe.api.ActiveMatching(self.field_definition)

    malformed_pairs = [
        (),          # no records at all
        (1, 2),      # two items, but not dictionaries
        (1, 2, 3),   # three items
        ({}, {}),    # dictionaries with no fields
    ]
    for bad_pair in malformed_pairs:
        with self.assertRaises(ValueError):
            matcher._checkRecordPairType(bad_pair)

    frank = {'name': 'Frank', 'age': '72'}
    bob = {'name': 'Bob', 'age': '27'}
    matcher._checkRecordPairType((frank, bob))
def test_initialize_fields(self):
    """Verify how ``ActiveMatching`` behaves at construction time.

    A call with no field definition is expected to raise ``TypeError``;
    a call with an empty definition is expected to yield a matcher whose
    ``blocker`` attribute is ``None``.
    """
    with self.assertRaises(TypeError):
        dedupe.api.ActiveMatching()

    matcher_without_fields = dedupe.api.ActiveMatching({})
    assert matcher_without_fields.blocker is None
self.activeLearner = training.ActiveLearning(self.data_sample,
self.data_model,
self.num_cores)
def _loadSampledRecords(self, data_sample):
"""Override to load blocking data from data_sample."""
class StaticDedupe(DedupeMatching, StaticMatching):
    """Static deduplication mixin.

    Combines ``DedupeMatching`` with ``StaticMatching``; the class adds
    no members of its own.
    """
class Dedupe(DedupeMatching, ActiveMatching):
"""
Mixin Class for Active Learning Deduplication
Public Methods
- sample
"""
canopies = True
def sample(self, data, sample_size=15000,
blocked_proportion=0.5):
'''Draw a sample of record pairs from the dataset
(a mix of random pairs & pairs of similar records)
and initialize active learning with this sample
Arguments: data -- Dictionary of records, where the keys are
record_ids and the values are dictionaries with the keys being
record_pair[0].keys() and record_pair[1].keys()
except AttributeError:
raise ValueError("A pair of record_pairs must be made up of two "
"dictionaries ")
self.data_model.check(record_pair[0])
self.data_model.check(record_pair[1])
class StaticDedupe(DedupeMatching, StaticMatching):
    """Mixin class for static deduplication.

    Inherits everything from ``DedupeMatching`` and ``StaticMatching``
    and defines nothing further itself.
    """
class Dedupe(DedupeMatching, ActiveMatching):
"""
Mixin Class for Active Learning Deduplication
Public Methods
- sample
"""
canopies = True
def prepare_training(self,
data,
training_file=None,
sample_size=15000,
blocked_proportion=0.5,
original_length=None):
'''
Sets up the learner.
def _checkData(self, data):
if len(data) == 0:
raise ValueError(
'Dictionary of records is empty.')
self.data_model.check(next(iter(viewvalues(data))))
class StaticRecordLink(RecordLinkMatching, StaticMatching):
    """Static record-linkage mixin.

    Combines ``RecordLinkMatching`` with ``StaticMatching``; the class
    adds no members of its own.
    """
class RecordLink(RecordLinkMatching, ActiveMatching):
"""
Mixin Class for Active Learning Record Linkage
Public Methods
- sample
"""
canopies = False
def prepare_training(self,
data_1,
data_2,
training_file=None,
sample_size=15000,
blocked_proportion=0.5,
original_length_1=None,
original_length_2=None):