import itertools
import os

import dedupe

# `duplicates_s`, `settings_file`, `data_d` and `training_pairs` come from the
# surrounding script: the known duplicate pairs, the path of a saved model,
# the records to deduplicate and the labelled training pairs.
print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    # Reuse a previously trained model if one has been saved.
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, num_cores=1)
else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'name', 'type': 'Exact'},
              {'field': 'address', 'type': 'String'},
              {'field': 'cuisine', 'type': 'ShortString',
               'has missing': True},
              {'field': 'city', 'type': 'ShortString'}
              ]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    deduper.sample(data_d, 10000)
    deduper.markPairs(training_pairs)
    deduper.train()

    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)

# Pick a score threshold, weighting recall and precision equally.
alpha = deduper.threshold(data_d, 1)

# print candidates
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold=alpha)

print('Evaluate Clustering')
confirm_dupes = set([])
for dupes, score in clustered_dupes:
    # Loop body reconstructed: record every within-cluster pair so it can be
    # compared against the known duplicate pairs.
    for pair in itertools.combinations(dupes, 2):
        confirm_dupes.add(frozenset(pair))
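# Evaluation sketch (an assumption, not part of the original script): compare
# the clustered pairs against the known duplicates, assuming `duplicates_s`
# holds frozensets of record ids in the same form as `confirm_dupes`.
true_positives = confirm_dupes & duplicates_s
print('found duplicate pairs', len(confirm_dupes))
print('precision', len(true_positives) / max(len(confirm_dupes), 1))
print('recall', len(true_positives) / max(len(duplicates_s), 1))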
def test_exact_comparator(self):
    deduper = dedupe.Dedupe([{'field': 'name',
                              'type': 'Exact'}
                             ])
    record_pairs = (({'name': 'Shmoo'}, {'name': 'Shmee'}),
                    ({'name': 'Shmoo'}, {'name': 'Shmoo'}))
    numpy.testing.assert_array_almost_equal(deduper.data_model.distances(record_pairs),
                                            numpy.array([[0.0],
                                                         [1.0]]),
                                            3)
def test_predicates_correctness(self):
    field = '123 16th st'
    assert dedupe.predicates.existsPredicate(field) == ('1',)
    assert dedupe.predicates.existsPredicate('') == ('0',)
    assert dedupe.predicates.existsPredicate(1) == ('1',)
    assert dedupe.predicates.existsPredicate(0) == ('0',)
    assert dedupe.predicates.sortedAcronym(field) == ('11s',)
    assert dedupe.predicates.wholeFieldPredicate(field) == ('123 16th st',)
    assert dedupe.predicates.firstTokenPredicate(field) == ('123',)
    assert dedupe.predicates.firstTokenPredicate('') == ()
    assert dedupe.predicates.firstTokenPredicate('123/') == ('123',)
    assert dedupe.predicates.tokenFieldPredicate(' ') == set([])
    assert dedupe.predicates.tokenFieldPredicate(field) == set(['123', '16th', 'st'])
    assert dedupe.predicates.commonIntegerPredicate(field) == set(['123', '16'])
    assert dedupe.predicates.commonIntegerPredicate('foo') == set([])
    assert dedupe.predicates.firstIntegerPredicate('foo') == ()
    assert dedupe.predicates.firstIntegerPredicate('1foo') == ('1',)
    assert dedupe.predicates.firstIntegerPredicate('f1oo') == ()
    assert dedupe.predicates.sameThreeCharStartPredicate(field) == ('123',)
    assert dedupe.predicates.sameThreeCharStartPredicate('12') == ('12',)
    assert dedupe.predicates.commonFourGram('12') == set([])
    assert dedupe.predicates.sameFiveCharStartPredicate(field) == ('12316',)
    assert dedupe.predicates.sameSevenCharStartPredicate(field) == ('12316th',)
    assert dedupe.predicates.nearIntegersPredicate(field) == set(['15', '17', '16', '122', '123', '124'])
    assert dedupe.predicates.commonFourGram(field) == set(
        ['1231', '2316', '316t', '16th', '6ths', 'thst'])
    assert dedupe.predicates.commonSixGram(field) == set(
        ['12316t', '2316th', '316ths', '16thst'])
    assert dedupe.predicates.initials(field, 12) == ('123 16th st',)
    assert dedupe.predicates.initials(field, 7) == ('123 16t',)
    assert dedupe.predicates.ngrams(field, 3) == ['123', '23 ', '3 1', ' 16', '16t', '6th', 'th ', 'h s', ' st']
    assert dedupe.predicates.commonTwoElementsPredicate((1, 2, 3)) == set(('1 2', '2 3'))
    assert dedupe.predicates.commonTwoElementsPredicate((1,)) == set([])
    assert dedupe.predicates.commonThreeElementsPredicate((1, 2, 3)) == set(('1 2 3',))
    assert dedupe.predicates.commonThreeElementsPredicate((1,)) == set([])
    assert dedupe.predicates.fingerprint('time sandwich') == (u'sandwichtime',)
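    # A minimal blocking sketch (not part of the test suite): records that
    # share at least one predicate value land in the same block and become
    # candidate pairs. `records` below is made-up illustration data.
    def _blocking_sketch(self):
        from collections import defaultdict

        records = {1: '123 16th st', 2: '123 16th street', 3: '456 elm ave'}

        blocks = defaultdict(set)
        for record_id, value in records.items():
            for key in dedupe.predicates.firstTokenPredicate(value):
                blocks[key].add(record_id)

        # records 1 and 2 share the block key '123'; record 3 is alone in '456'
        assert blocks['123'] == {1, 2}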
import dedupe
import unittest
import random
import numpy
import warnings
from collections import OrderedDict
DATA_SAMPLE = ((dedupe.core.frozendict({'age': '27', 'name': 'Kyle'}),
                dedupe.core.frozendict({'age': '50', 'name': 'Bob'})),
               (dedupe.core.frozendict({'age': '27', 'name': 'Kyle'}),
                dedupe.core.frozendict({'age': '35', 'name': 'William'})),
               (dedupe.core.frozendict({'age': '10', 'name': 'Sue'}),
                dedupe.core.frozendict({'age': '35', 'name': 'William'})),
               (dedupe.core.frozendict({'age': '27', 'name': 'Kyle'}),
                dedupe.core.frozendict({'age': '20', 'name': 'Jimmy'})),
               (dedupe.core.frozendict({'age': '75', 'name': 'Charlie'}),
                dedupe.core.frozendict({'age': '21', 'name': 'Jimbo'})))
data_dict = OrderedDict(((0, {'name': 'Bob', 'age': '51'}),
                         (1, {'name': 'Linda', 'age': '50'}),
                         (2, {'name': 'Gene', 'age': '12'}),
                         (3, {'name': 'Tina', 'age': '15'}),
                         (4, {'name': 'Bob B.', 'age': '51'}),
                         (5, {'name': 'bob belcher', 'age': '51'})))
def canonicalImport(filename):
    preProcess = exampleIO.preProcess

    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            clean_row = [(k, preProcess(v)) for (k, v) in
                         viewitems(row)]
            data_d[filename + str(i)] = dedupe.core.frozendict(clean_row)

    return data_d, reader.fieldnames
def test_hash_is_order_insensitive(self):
    frozendict = dedupe.core.frozendict
    test_dict = {'smtp': 21, 'dict': 2628}
    reverse_test_dict = {'dict': 2628, 'smtp': 21}
    assert test_dict == reverse_test_dict
    test_frozendict = frozendict(test_dict)
    reverse_test_frozendict = frozendict(reverse_test_dict)
    assert frozendict(test_dict) == frozendict(reverse_test_dict)
    assert hash(test_frozendict) == hash(reverse_test_frozendict)
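    # A small illustration (not from the test file): because equality and
    # hashing ignore insertion order, frozendict records can be used as set
    # members or dictionary keys regardless of how their fields were ordered.
    def _frozendict_sketch(self):
        frozendict = dedupe.core.frozendict
        seen = {frozendict({'name': 'Bob', 'age': '51'})}
        assert frozendict({'age': '51', 'name': 'Bob'}) in seen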
def canonicalImport(filename):
    preProcess = exampleIO.preProcess

    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for (i, row) in enumerate(reader):
            clean_row = [(k, preProcess(v)) for (k, v) in
                         viewitems(row)]
            # Unlike the variant above, records are keyed by row index alone.
            data_d[i] = dedupe.core.frozendict(clean_row)

    return data_d, reader.fieldnames
def test_uncovered_by(self):
    before = {1: {1, 2, 3}, 2: {1, 2}, 3: {3}}
    after = {1: {1, 2}, 2: {1, 2}}
    before_copy = before.copy()

    assert training.BranchBound.uncovered_by(before, set()) == before
    assert training.BranchBound.uncovered_by(before, {3}) == after
    assert before == before_copy
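    # A reference sketch (an assumption, not the library implementation) of
    # the behaviour exercised above: remove the ids in `covered` from every
    # candidate's cover set and drop candidates left with nothing to cover,
    # without mutating the input mapping.
    @staticmethod
    def _uncovered_by_sketch(candidates, covered):
        remaining = {}
        for key, covered_ids in candidates.items():
            still_uncovered = covered_ids - covered
            if still_uncovered:
                remaining[key] = still_uncovered
        return remaining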