Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_precise_latlong(self):
    """Check latLongGridPredicate: a known coordinate yields exactly one
    block key, and the (0, 0) origin (missing-value sentinel) yields none."""
    expected_key = (u'[42.5, -5.0]',)
    assert predicates.latLongGridPredicate(self.latlong1) == expected_key
    assert predicates.latLongGridPredicate((0, 0)) == ()
def test_precise_latlong(self):
    """Grid predicate: a real coordinate maps to a single block key;
    the origin (0, 0) maps to no keys at all."""
    # NOTE(review): this duplicates an identically named test method earlier
    # in this chunk; if both live in one class, only the later one runs.
    key = predicates.latLongGridPredicate(self.latlong1)
    assert key == (u'[42.5, -5.0]',)
    key = predicates.latLongGridPredicate((0, 0))
    assert key == ()
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so the labeling session
# can be reused on later runs.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register extra predicate families beyond the defaults: whole-set /
# common-element predicates for set-valued fields, and a grid predicate
# for latitude/longitude pairs.
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print 'blocking...'

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data
#blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                    r_uncovered_dupes,
                                                                    deduper
                                                                    )
# No blocking configuration satisfied the ppc / uncovered-dupes constraints;
# report the starting parameters so the operator can relax them.
if not blocker:
    print 'No valid blocking settings found'
    print 'Starting ppc value: %s' % r_ppc
    print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
from math import sqrt
from .base import FieldType
from dedupe import predicates
from haversine import haversine
class LatLongType(FieldType):
    """Geographic coordinate field blocked on a lat/long grid and compared
    with the square root of the haversine distance."""

    type = "LatLong"

    _predicate_functions = [predicates.latLongGridPredicate]

    @staticmethod
    def comparator(x, y):
        """Return sqrt of the haversine distance between points *x* and *y*."""
        distance = haversine(x, y)
        return sqrt(distance)
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so the labeling session
# can be reused on later runs.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register extra predicate families beyond the defaults: whole-set /
# common-element predicates for set-valued fields, and a grid predicate
# for latitude/longitude pairs.
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print 'blocking...'

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data
#blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                    r_uncovered_dupes,
                                                                    deduper
                                                                    )
# No blocking configuration satisfied the ppc / uncovered-dupes constraints;
# report the starting parameters so the operator can relax them.
if not blocker:
    print 'No valid blocking settings found'
    print 'Starting ppc value: %s' % r_ppc
    print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
class TextType(StringType):
    """String field compared with cosine text similarity over a corpus."""

    type = "Text"

    def __init__(self, definition):
        super(TextType, self).__init__(definition)
        # Fall back to an empty corpus when the definition supplies none;
        # setdefault mutates the definition just like the original check did.
        corpus = definition.setdefault('corpus', [])
        self.comparator = dedupe.distance.CosineTextSimilarity(corpus)
class LatLongType(FieldType):
    """Geographic coordinate field blocked on a lat/long grid and compared
    with the haversine distance; (0.0, 0.0) is treated as a missing value."""

    type = "LatLong"

    _predicate_functions = [predicates.latLongGridPredicate]

    @staticmethod
    def comparator(field_1, field_2):
        # (0.0, 0.0) is the missing-coordinate sentinel: no distance defined.
        missing = (0.0, 0.0)
        if field_1 == missing or field_2 == missing:
            return numpy.nan
        return haversine(field_1, field_2)
# NOTE(review): this class definition appears truncated in this chunk — the
# predicate tuple is cut off mid-listing and never closed; confirm against
# the full source file before editing.
class SetType(FieldType):
    # Set-valued field blocked on element-overlap predicates.
    type = "Set"

    _predicate_functions = (dedupe.predicates.wholeSetPredicate,
                            dedupe.predicates.commonSetElementPredicate,
                            dedupe.predicates.lastSetElementPredicate,
                            dedupe.predicates.commonTwoElementsPredicate,
                            dedupe.predicates.commonThreeElementsPredicate,
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so the labeling session
# can be reused on later runs.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register extra predicate families beyond the defaults: whole-set /
# common-element predicates for set-valued fields, and a grid predicate
# for latitude/longitude pairs.
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print 'blocking...'

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data
#blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                    r_uncovered_dupes,
                                                                    deduper
                                                                    )
# No blocking configuration satisfied the ppc / uncovered-dupes constraints;
# report the starting parameters so the operator can relax them.
if not blocker:
    print 'No valid blocking settings found'
    print 'Starting ppc value: %s' % r_ppc
    print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so the labeling session
# can be reused on later runs.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register extra predicate families beyond the defaults: whole-set /
# common-element predicates for set-valued fields, and a grid predicate
# for latitude/longitude pairs.
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print 'blocking...'

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data
#blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                    r_uncovered_dupes,
                                                                    deduper
                                                                    )
# No blocking configuration satisfied the ppc / uncovered-dupes constraints;
# report the starting parameters so the operator can relax them.
if not blocker:
    print 'No valid blocking settings found'
    print 'Starting ppc value: %s' % r_ppc
    print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so the labeling session
# can be reused on later runs.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register extra predicate families beyond the defaults: whole-set /
# common-element predicates for set-valued fields, and a grid predicate
# for latitude/longitude pairs.
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print 'blocking...'

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data
#blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                    r_uncovered_dupes,
                                                                    deduper
                                                                    )
# No blocking configuration satisfied the ppc / uncovered-dupes constraints;
# report the starting parameters so the operator can relax them.
if not blocker:
    print 'No valid blocking settings found'
    print 'Starting ppc value: %s' % r_ppc
    print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so the labeling session
# can be reused on later runs.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register extra predicate families beyond the defaults: whole-set /
# common-element predicates for set-valued fields, and a grid predicate
# for latitude/longitude pairs.
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print 'blocking...'

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data
#blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                    r_uncovered_dupes,
                                                                    deduper
                                                                    )
# No blocking configuration satisfied the ppc / uncovered-dupes constraints;
# report the starting parameters so the operator can relax them.
if not blocker:
    print 'No valid blocking settings found'
    print 'Starting ppc value: %s' % r_ppc
    print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes