Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Tail of a run of predicate checks. `field` must already be bound by the
# enclosing scope, which is not visible in this excerpt.
# NOTE(review): the expected values below imply field == '123 16th st' -- confirm.
preds = dedupe.predicates

# Integer predicates: a value with no leading/embedded match yields an empty result.
assert preds.commonIntegerPredicate(field) == set(['123', '16'])
assert preds.commonIntegerPredicate('foo') == set()
assert preds.firstIntegerPredicate('foo') == ()
assert preds.firstIntegerPredicate('1foo') == ('1',)
assert preds.firstIntegerPredicate('f1oo') == ()

# Fixed-length prefix predicates and the short-input n-gram case.
assert preds.sameThreeCharStartPredicate(field) == ('123',)
assert preds.sameThreeCharStartPredicate('12') == ('12',)
assert preds.commonFourGram('12') == set()
assert preds.sameFiveCharStartPredicate(field) == ('12316',)
assert preds.sameSevenCharStartPredicate(field) == ('12316th',)

assert preds.nearIntegersPredicate(field) == set(
    ['15', '17', '16', '122', '123', '124'])

# Character n-gram predicates.
assert preds.commonFourGram(field) == set(
    ['1231', '2316', '316t', '16th', '6ths', 'thst'])
assert preds.commonSixGram(field) == set(
    ['12316t', '2316th', '316ths', '16thst'])

assert preds.initials(field, 12) == ('123 16th st',)
assert preds.initials(field, 7) == ('123 16t',)
assert preds.ngrams(field, 3) == [
    '123', '23 ', '3 1', ' 16', '16t', '6th', 'th ', 'h s', ' st']

# Element-wise predicates over tuples; a single element yields an empty set.
assert preds.commonTwoElementsPredicate((1, 2, 3)) == set(['1 2', '2 3'])
assert preds.commonTwoElementsPredicate((1,)) == set()
assert preds.commonThreeElementsPredicate((1, 2, 3)) == set(['1 2 3'])
assert preds.commonThreeElementsPredicate((1,)) == set()

assert preds.fingerprint('time sandwich') == (u'sandwichtime',)
# Fragment of a run of predicate checks. `field` must already be bound by the
# enclosing scope, which is not visible in this excerpt.
# NOTE(review): the wholeFieldPredicate check implies field == '123 16th st' -- confirm.
preds = dedupe.predicates

# existsPredicate: the empty string and 0 both give ('0',).
assert preds.existsPredicate('') == ('0',)
assert preds.existsPredicate(1) == ('1',)
assert preds.existsPredicate(0) == ('0',)

assert preds.sortedAcronym(field) == ('11s',)
assert preds.wholeFieldPredicate(field) == ('123 16th st',)

# Token predicates.
assert preds.firstTokenPredicate(field) == ('123',)
assert preds.firstTokenPredicate('') == ()
assert preds.firstTokenPredicate('123/') == ('123',)
assert preds.tokenFieldPredicate(' ') == set()
assert preds.tokenFieldPredicate(field) == set(['123', '16th', 'st'])

# Integer predicates.
assert preds.commonIntegerPredicate(field) == set(['123', '16'])
assert preds.commonIntegerPredicate('foo') == set()
assert preds.firstIntegerPredicate('foo') == ()
assert preds.firstIntegerPredicate('1foo') == ('1',)
assert preds.firstIntegerPredicate('f1oo') == ()

# Fixed-length prefix predicates and the short-input n-gram case.
assert preds.sameThreeCharStartPredicate(field) == ('123',)
assert preds.sameThreeCharStartPredicate('12') == ('12',)
assert preds.commonFourGram('12') == set()
assert preds.sameFiveCharStartPredicate(field) == ('12316',)
assert preds.sameSevenCharStartPredicate(field) == ('12316th',)

assert preds.nearIntegersPredicate(field) == set(
    ['15', '17', '16', '122', '123', '124'])

# Character n-gram predicates.
assert preds.commonFourGram(field) == set(
    ['1231', '2316', '316t', '16th', '6ths', 'thst'])
assert preds.commonSixGram(field) == set(
    ['12316t', '2316th', '316ths', '16thst'])

assert preds.initials(field, 12) == ('123 16th st',)
assert preds.initials(field, 7) == ('123 16t',)
def test_predicates_correctness(self):
    """Check several dedupe predicates against the sample value '123 16th st'."""
    field = '123 16th st'
    preds = dedupe.predicates

    # existsPredicate: the empty string and 0 both give ('0',).
    assert preds.existsPredicate(field) == ('1',)
    assert preds.existsPredicate('') == ('0',)
    assert preds.existsPredicate(1) == ('1',)
    assert preds.existsPredicate(0) == ('0',)

    assert preds.sortedAcronym(field) == ('11s',)
    assert preds.wholeFieldPredicate(field) == ('123 16th st',)

    # Token predicates.
    assert preds.firstTokenPredicate(field) == ('123',)
    assert preds.firstTokenPredicate('') == ()
    assert preds.firstTokenPredicate('123/') == ('123',)
    assert preds.tokenFieldPredicate(' ') == set()
    assert preds.tokenFieldPredicate(field) == set(['123', '16th', 'st'])

    # Integer predicates.
    assert preds.commonIntegerPredicate(field) == set(['123', '16'])
    assert preds.commonIntegerPredicate('foo') == set()
    assert preds.firstIntegerPredicate('foo') == ()
    assert preds.firstIntegerPredicate('1foo') == ('1',)
    assert preds.firstIntegerPredicate('f1oo') == ()

    # Prefix predicate and the short-input n-gram case.
    assert preds.sameThreeCharStartPredicate(field) == ('123',)
    assert preds.sameThreeCharStartPredicate('12') == ('12',)
    assert preds.commonFourGram('12') == set()
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# Use the 'y', 'n' and 'u' keys to flag duplicates;
# press 'f' when you are finished.
#
# print(...) with a single parenthesised argument produces the same output
# on Python 2 and Python 3; the bare print statement only parses on Python 2.
print('starting active labeling...')
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk
deduper.writeTraining(training_file)

# Blocking
# Add the 'Custom' and 'LatLong' entries to the deduper's blocker_types mapping.
deduper.blocker_types.update({
    'Custom': (dedupe.predicates.wholeSetPredicate,
               dedupe.predicates.commonSetElementPredicate),
    'LatLong': (dedupe.predicates.latLongGridPredicate,),
})
time_start = time.time()
print('blocking...')

# Initialize the blocker
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(
    ppc, dupes, deduper)

# Occasionally the blocker fails to find useful values. If so,
# print the final values and exit.
if not blocker:
    print('No valid blocking settings found')
    print('Starting ppc value: %s' % ppc)
from .base import FieldType, DerivedType
from dedupe import predicates
from categorical import CategoricalComparator
class CategoricalType(FieldType):
    """Field type whose ``type`` is "Categorical".

    Uses ``wholeFieldPredicate`` as its only predicate function and builds
    its comparator from the ``"categories"`` entry of the definition.
    """

    type = "Categorical"
    _predicate_functions = [predicates.wholeFieldPredicate]

    def _categories(self, definition):
        """Return ``definition["categories"]``.

        Raises:
            ValueError: if the definition has no ``"categories"`` key.
        """
        try:
            return definition["categories"]
        except KeyError:
            raise ValueError('No "categories" defined')

    def __init__(self, definition):
        """Run the base initialiser, then attach a CategoricalComparator."""
        super(CategoricalType, self).__init__(definition)
        self.comparator = CategoricalComparator(self._categories(definition))
@staticmethod
def comparator(price_1, price_2):
    """Return the absolute difference of the base-10 logs of two prices.

    If either price is less than or equal to zero the result is
    ``numpy.nan``, since the logarithm is only taken of positive values.
    """
    # Guard clause: price_2 is only inspected when price_1 is positive.
    if price_1 <= 0 or price_2 <= 0:
        return numpy.nan

    log_gap = numpy.log10(price_1) - numpy.log10(price_2)
    return abs(log_gap)
class ShortStringType(FieldType):
    """Field type whose ``type`` is "ShortString".

    Compares values with ``normalizedAffineGapDistance`` and declares the
    predicate functions listed in ``_predicate_functions``.
    """

    type = "ShortString"
    comparator = normalizedAffineGapDistance

    # Order is kept exactly as originally declared.
    _predicate_functions = (
        dedupe.predicates.wholeFieldPredicate,
        dedupe.predicates.tokenFieldPredicate,
        dedupe.predicates.firstTokenPredicate,
        dedupe.predicates.commonIntegerPredicate,
        dedupe.predicates.nearIntegersPredicate,
        dedupe.predicates.firstIntegerPredicate,
        dedupe.predicates.sameThreeCharStartPredicate,
        dedupe.predicates.sameFiveCharStartPredicate,
        dedupe.predicates.sameSevenCharStartPredicate,
        dedupe.predicates.commonFourGram,
        dedupe.predicates.commonSixGram,
        dedupe.predicates.commonTwoTokens,
        dedupe.predicates.commonThreeTokens,
        dedupe.predicates.fingerprint,
        dedupe.predicates.oneGramFingerprint,
        dedupe.predicates.twoGramFingerprint,
        dedupe.predicates.sortedAcronym,
    )
class StringType(ShortStringType):
    """String field type derived from ShortStringType."""

    def __init__(self, definition):
        """Set ``field``, ``name`` and ``predicates`` from the definition.

        ``name`` is ``definition['variable name']`` when that key is present;
        otherwise it is built from the field name and ``self.type``.
        """
        self.field = definition['field']

        self.name = (definition['variable name']
                     if 'variable name' in definition
                     else "(%s: %s)" % (self.field, self.type))

        # One SimplePredicate per declared predicate function, bound to this field.
        self.predicates = [predicates.SimplePredicate(func, self.field)
                           for func in self._predicate_functions]

        # NOTE(review): super() is anchored at FieldType rather than this
        # class, so the MRO lookup starts after FieldType -- confirm intended.
        super(FieldType, self).__init__(definition)