How to use the dedupe.predicates.wholeSetPredicate function in dedupe

To help you get started, we've selected a few dedupe examples based on popular ways it is used in public projects.
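
wholeSetPredicate is one of dedupe's blocking predicates for set-valued fields: as the tests below show, it returns a one-element tuple whose only block key is the string form of the whole set, so records block together only when their sets are identical. A minimal sketch of calling it directly (the colors value is just an illustration):

from dedupe import predicates

colors = frozenset(['red', 'blue', 'green'])

# wholeSetPredicate turns the entire set into a single block key:
# the tuple holds str(colors), so only records with an identical
# set share a block.
block_keys = predicates.wholeSetPredicate(colors)
print(block_keys)  # e.g. ("frozenset({'red', 'blue', 'green'})",)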

github dedupeio / dedupe / tests / test_predicates.py
def test_full_set(self):
        block_val = predicates.wholeSetPredicate(self.s1)
        self.assertEqual(block_val, (str(self.s1),))
github dedupeio / dedupe / tests / test_predicates.py
def test_set(self):
        s1 = predicates.SimplePredicate(predicates.wholeSetPredicate,
                                        'foo')
        colors = set(['red', 'blue', 'green'])
        assert s1({'foo': colors}) == (str(colors),)
github dedupeio / dedupe / examples / patent_example / patent_example_twostage.py
# ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
github dedupeio / dedupe / dedupe / variables / fieldclasses.py
class LatLongType(FieldType) :
    type = "LatLong"

    _predicate_functions = [predicates.latLongGridPredicate]

    @staticmethod
    def comparator(field_1, field_2) :
        if field_1 == (0.0,0.0) or field_2 == (0.0,0.0) :
            return numpy.nan
        else :
            return haversine(field_1, field_2)

class SetType(FieldType) :
    type = "Set"

    _predicate_functions = (dedupe.predicates.wholeSetPredicate,
                            dedupe.predicates.commonSetElementPredicate,
                            dedupe.predicates.lastSetElementPredicate,
                            dedupe.predicates.commonTwoElementsPredicate,
                            dedupe.predicates.commonThreeElementsPredicate,
                            dedupe.predicates.firstSetElementPredicate)
    
    _canopy_thresholds = (0.2, 0.4, 0.6, 0.8)

    def __init__(self, definition) :
        super(SetType, self).__init__(definition)

        canopy_predicates = [predicates.TfidfPredicate(threshold, 
                                                       self.field)
                             for threshold in self._canopy_thresholds]

        self.predicates += canopy_predicates
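
In the library itself you rarely call wholeSetPredicate by hand; declaring a field of type "Set", as the SetType class above does, adds it (along with the other set predicates) to the pool of candidate blocking rules that dedupe weighs during training. A rough sketch, assuming a dedupe release where fields are declared as dictionaries; the memberships field name is hypothetical:

import dedupe

# A 'Set' field pulls in SetType._predicate_functions, including
# wholeSetPredicate, as candidate blocking predicates that dedupe
# evaluates against labeled training pairs.
fields = [
    {'field': 'memberships', 'type': 'Set'},
    {'field': 'name', 'type': 'String'},
]
deduper = dedupe.Dedupe(fields)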
github markhuberty / psClean / code / dedupe / archive / fi / patent_example_twostage_fi.py
# ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
github markhuberty / psClean / code / dedupe / gb_weighted / patent_example_twostage_gb.py
# ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
github markhuberty / psClean / code / dedupe / archive / dk / patent_example_twostage_dk.py
# ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
github dedupeio / dedupe / dedupe / variables / set.py
from .base import FieldType
from dedupe import predicates
from simplecosine.cosine import CosineSetSimilarity


class SetType(FieldType):
    type = "Set"

    _predicate_functions = (predicates.wholeSetPredicate,
                            predicates.commonSetElementPredicate,
                            predicates.lastSetElementPredicate,
                            predicates.commonTwoElementsPredicate,
                            predicates.commonThreeElementsPredicate,
                            predicates.magnitudeOfCardinality,
                            predicates.firstSetElementPredicate)

    _index_predicates = (predicates.TfidfSetSearchPredicate,
                         predicates.TfidfSetCanopyPredicate)
    _index_thresholds = (0.2, 0.4, 0.6, 0.8)

    def __init__(self, definition):
        super(SetType, self).__init__(definition)

        if 'corpus' not in definition:
            definition['corpus'] = []
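
The corpus entry above only matters for the TF-IDF index predicates; wholeSetPredicate and the other plain set predicates need no corpus. A hedged sketch of a Set field definition that supplies one (the field name and corpus contents are made up for illustration):

# 'corpus' feeds the TfidfSetSearchPredicate / TfidfSetCanopyPredicate
# index predicates; it is optional and defaults to an empty list.
field_definition = {
    'field': 'keywords',
    'type': 'Set',
    'corpus': [
        frozenset(['record', 'linkage']),
        frozenset(['blocking', 'predicates', 'dedupe']),
    ],
}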
github markhuberty / psClean / code / dedupe / archive / es / patent_example_twostage_es.py
# ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
github markhuberty / psClean / code / dedupe / archive / it / patent_example_twostage_it.py
# ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'