# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_empty_set(self):
    # An empty input set must yield no blocking keys at all.
    result = predicates.commonSetElementPredicate(set())
    self.assertEqual(result, ())
def test_long_set(self):
    # Each element of the multi-element fixture set becomes its own key;
    # compare as sets because key order is not part of the contract.
    keys = predicates.commonSetElementPredicate(self.s1)
    self.assertEqual(set(keys), {'blue', 'green', 'red'})
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so later runs can reuse it.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register additional predicate families for the blocking learner.
deduper.blocker_types.update({
    'Custom': (dedupe.predicates.wholeSetPredicate,
               dedupe.predicates.commonSetElementPredicate),
    'LatLong': (dedupe.predicates.latLongGridPredicate,),
})

time_start = time.time()
print('blocking...')

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data.
# (Wrapper around deduper.blockingFunction(r_ppc, r_uncovered_dupes) that
# relaxes the thresholds until a valid setting is found.)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(
    r_ppc, r_uncovered_dupes, deduper
)

# NOTE(review): the flattened source lost indentation here; these prints are
# assumed to belong to the failure branch (otherwise the "no valid settings"
# message would print unconditionally) -- TODO confirm against the original.
if not blocker:
    print('No valid blocking settings found')
    print('Starting ppc value: %s' % r_ppc)
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so later runs can reuse it.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register additional predicate families for the blocking learner.
deduper.blocker_types.update({
    'Custom': (dedupe.predicates.wholeSetPredicate,
               dedupe.predicates.commonSetElementPredicate),
    'LatLong': (dedupe.predicates.latLongGridPredicate,),
})

time_start = time.time()
print('blocking...')

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data.
# (Wrapper around deduper.blockingFunction(r_ppc, r_uncovered_dupes) that
# relaxes the thresholds until a valid setting is found.)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(
    r_ppc, r_uncovered_dupes, deduper
)

# NOTE(review): the flattened source lost indentation here; these prints are
# assumed to belong to the failure branch (otherwise the "no valid settings"
# message would print unconditionally) -- TODO confirm against the original.
if not blocker:
    print('No valid blocking settings found')
    print('Starting ppc value: %s' % r_ppc)
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so later runs can reuse it.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register additional predicate families for the blocking learner.
deduper.blocker_types.update({
    'Custom': (dedupe.predicates.wholeSetPredicate,
               dedupe.predicates.commonSetElementPredicate),
    'LatLong': (dedupe.predicates.latLongGridPredicate,),
})

time_start = time.time()
print('blocking...')

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data.
# (Wrapper around deduper.blockingFunction(r_ppc, r_uncovered_dupes) that
# relaxes the thresholds until a valid setting is found.)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(
    r_ppc, r_uncovered_dupes, deduper
)

# NOTE(review): the flattened source lost indentation here; these prints are
# assumed to belong to the failure branch (otherwise the "no valid settings"
# message would print unconditionally) -- TODO confirm against the original.
if not blocker:
    print('No valid blocking settings found')
    print('Starting ppc value: %s' % r_ppc)
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so later runs can reuse it.
deduper.writeTraining(training_file)

# Blocking
# Register additional predicate families for the blocking learner.
deduper.blocker_types.update({
    'Custom': (dedupe.predicates.wholeSetPredicate,
               dedupe.predicates.commonSetElementPredicate),
    'LatLong': (dedupe.predicates.latLongGridPredicate,),
})

time_start = time.time()
print('blocking...')

# Initialize the blocker
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(
    ppc, dupes, deduper
)

# Occasionally the blocker fails to find useful values. If so,
# print the final values and exit.
# NOTE(review): the flattened source lost indentation; this print is assumed
# to belong under the `if` -- TODO confirm against the original.
if not blocker:
    print('No valid blocking settings found')
from .base import FieldType
from dedupe import predicates
from simplecosine.cosine import CosineSetSimilarity
class SetType(FieldType):
    # Field type for set-valued fields. Blocking draws on element-based
    # predicate functions plus TF-IDF canopy/search index predicates.
    type = "Set"

    _predicate_functions = (predicates.wholeSetPredicate,
                            predicates.commonSetElementPredicate,
                            predicates.lastSetElementPredicate,
                            predicates.commonTwoElementsPredicate,
                            predicates.commonThreeElementsPredicate,
                            predicates.magnitudeOfCardinality,
                            predicates.firstSetElementPredicate)

    _index_predicates = (predicates.TfidfSetSearchPredicate,
                         predicates.TfidfSetCanopyPredicate)

    _index_thresholds = (0.2, 0.4, 0.6, 0.8)

    def __init__(self, definition):
        super(SetType, self).__init__(definition)
        # Fall back to an empty corpus when the field definition omits one.
        definition.setdefault('corpus', [])
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so later runs can reuse it.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register additional predicate families for the blocking learner.
deduper.blocker_types.update({
    'Custom': (dedupe.predicates.wholeSetPredicate,
               dedupe.predicates.commonSetElementPredicate),
    'LatLong': (dedupe.predicates.latLongGridPredicate,),
})

time_start = time.time()
print('blocking...')

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data.
# (Wrapper around deduper.blockingFunction(r_ppc, r_uncovered_dupes) that
# relaxes the thresholds until a valid setting is found.)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(
    r_ppc, r_uncovered_dupes, deduper
)

# NOTE(review): the flattened source lost indentation here; these prints are
# assumed to belong to the failure branch (otherwise the "no valid settings"
# message would print unconditionally) -- TODO confirm against the original.
if not blocker:
    print('No valid blocking settings found')
    print('Starting ppc value: %s' % r_ppc)
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so later runs can reuse it.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register additional predicate families for the blocking learner.
deduper.blocker_types.update({
    'Custom': (dedupe.predicates.wholeSetPredicate,
               dedupe.predicates.commonSetElementPredicate),
    'LatLong': (dedupe.predicates.latLongGridPredicate,),
})

time_start = time.time()
print('blocking...')

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data.
# (Wrapper around deduper.blockingFunction(r_ppc, r_uncovered_dupes) that
# relaxes the thresholds until a valid setting is found.)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(
    r_ppc, r_uncovered_dupes, deduper
)

# NOTE(review): the flattened source lost indentation here; these prints are
# assumed to belong to the failure branch (otherwise the "no valid settings"
# message would print unconditionally) -- TODO confirm against the original.
if not blocker:
    print('No valid blocking settings found')
    print('Starting ppc value: %s' % r_ppc)
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
deduper.train(data_sample, dedupe.training.consoleLabel)

# When finished, save our training away to disk so later runs can reuse it.
deduper.writeTraining(r_training_file)

# ## Blocking
# Register additional predicate families for the blocking learner.
deduper.blocker_types.update({
    'Custom': (dedupe.predicates.wholeSetPredicate,
               dedupe.predicates.commonSetElementPredicate),
    'LatLong': (dedupe.predicates.latLongGridPredicate,),
})

time_start = time.time()
print('blocking...')

# Initialize our blocker, which determines our field weights and blocking
# predicates based on our training data.
# (Wrapper around deduper.blockingFunction(r_ppc, r_uncovered_dupes) that
# relaxes the thresholds until a valid setting is found.)
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(
    r_ppc, r_uncovered_dupes, deduper
)

# NOTE(review): the flattened source lost indentation here; these prints are
# assumed to belong to the failure branch (otherwise the "no valid settings"
# message would print unconditionally) -- TODO confirm against the original.
if not blocker:
    print('No valid blocking settings found')
    print('Starting ppc value: %s' % r_ppc)