How to use the dedupe.predicates module in dedupe

To help you get started, we’ve selected a few dedupe examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dedupeio / dedupe / tests / test_dedupe.py View on Github external
assert dedupe.predicates.commonIntegerPredicate(
            field) == set(['123', '16'])
        assert dedupe.predicates.commonIntegerPredicate('foo') == set([])
        assert dedupe.predicates.firstIntegerPredicate('foo') == ()
        assert dedupe.predicates.firstIntegerPredicate('1foo') == ('1',)
        assert dedupe.predicates.firstIntegerPredicate('f1oo') == ()
        assert dedupe.predicates.sameThreeCharStartPredicate(field) == ('123',)
        assert dedupe.predicates.sameThreeCharStartPredicate('12') == ('12', )
        assert dedupe.predicates.commonFourGram('12') == set([])
        assert dedupe.predicates.sameFiveCharStartPredicate(
            field) == ('12316',)
        assert dedupe.predicates.sameSevenCharStartPredicate(
            field) == ('12316th',)
        assert dedupe.predicates.nearIntegersPredicate(
            field) == set(['15', '17', '16', '122', '123', '124'])
        assert dedupe.predicates.commonFourGram(field) == set(
            ['1231', '2316', '316t', '16th', '6ths', 'thst'])
        assert dedupe.predicates.commonSixGram(field) == set(
            ['12316t', '2316th', '316ths', '16thst'])
        assert dedupe.predicates.initials(field, 12) == ('123 16th st',)
        assert dedupe.predicates.initials(field, 7) == ('123 16t',)
        assert dedupe.predicates.ngrams(
            field, 3) == ['123', '23 ', '3 1', ' 16', '16t', '6th', 'th ', 'h s', ' st']
        assert dedupe.predicates.commonTwoElementsPredicate(
            (1, 2, 3)) == set(('1 2', '2 3'))
        assert dedupe.predicates.commonTwoElementsPredicate((1,)) == set([])
        assert dedupe.predicates.commonThreeElementsPredicate(
            (1, 2, 3)) == set(('1 2 3',))
        assert dedupe.predicates.commonThreeElementsPredicate((1,)) == set([])

        assert dedupe.predicates.fingerprint(
            'time sandwich') == (u'sandwichtime',)
github dedupeio / dedupe / tests / test_dedupe.py View on Github external
assert dedupe.predicates.existsPredicate('') == ('0',)
        assert dedupe.predicates.existsPredicate(1) == ('1',)
        assert dedupe.predicates.existsPredicate(0) == ('0',)
        assert dedupe.predicates.sortedAcronym(field) == ('11s',)
        assert dedupe.predicates.wholeFieldPredicate(field) == ('123 16th st',)
        assert dedupe.predicates.firstTokenPredicate(field) == ('123',)
        assert dedupe.predicates.firstTokenPredicate('') == ()
        assert dedupe.predicates.firstTokenPredicate('123/') == ('123',)
        assert dedupe.predicates.tokenFieldPredicate(' ') == set([])
        assert dedupe.predicates.tokenFieldPredicate(
            field) == set(['123', '16th', 'st'])
        assert dedupe.predicates.commonIntegerPredicate(
            field) == set(['123', '16'])
        assert dedupe.predicates.commonIntegerPredicate('foo') == set([])
        assert dedupe.predicates.firstIntegerPredicate('foo') == ()
        assert dedupe.predicates.firstIntegerPredicate('1foo') == ('1',)
        assert dedupe.predicates.firstIntegerPredicate('f1oo') == ()
        assert dedupe.predicates.sameThreeCharStartPredicate(field) == ('123',)
        assert dedupe.predicates.sameThreeCharStartPredicate('12') == ('12', )
        assert dedupe.predicates.commonFourGram('12') == set([])
        assert dedupe.predicates.sameFiveCharStartPredicate(
            field) == ('12316',)
        assert dedupe.predicates.sameSevenCharStartPredicate(
            field) == ('12316th',)
        assert dedupe.predicates.nearIntegersPredicate(
            field) == set(['15', '17', '16', '122', '123', '124'])
        assert dedupe.predicates.commonFourGram(field) == set(
            ['1231', '2316', '316t', '16th', '6ths', 'thst'])
        assert dedupe.predicates.commonSixGram(field) == set(
            ['12316t', '2316th', '316ths', '16thst'])
        assert dedupe.predicates.initials(field, 12) == ('123 16th st',)
        assert dedupe.predicates.initials(field, 7) == ('123 16t',)
github dedupeio / dedupe / tests / test_dedupe.py View on Github external
def test_predicates_correctness(self):
    """Check built-in predicates against hand-computed blocking keys.

    Every assertion feeds a fixed input (mostly the address-like string
    ``'123 16th st'``) to one predicate and compares the result with the
    exact tuple or set of keys it is expected to produce.
    """
    preds = dedupe.predicates
    field = '123 16th st'

    # existsPredicate: '1' for the truthy inputs, '0' for the falsy ones.
    assert preds.existsPredicate(field) == ('1',)
    assert preds.existsPredicate('') == ('0',)
    assert preds.existsPredicate(1) == ('1',)
    assert preds.existsPredicate(0) == ('0',)

    # Whole-field and token-based predicates.
    assert preds.sortedAcronym(field) == ('11s',)
    assert preds.wholeFieldPredicate(field) == ('123 16th st',)
    assert preds.firstTokenPredicate(field) == ('123',)
    assert preds.firstTokenPredicate('') == ()
    assert preds.firstTokenPredicate('123/') == ('123',)
    assert preds.tokenFieldPredicate(' ') == set()
    assert preds.tokenFieldPredicate(field) == {'123', '16th', 'st'}

    # Integer-based predicates.
    assert preds.commonIntegerPredicate(field) == {'123', '16'}
    assert preds.commonIntegerPredicate('foo') == set()
    assert preds.firstIntegerPredicate('foo') == ()
    assert preds.firstIntegerPredicate('1foo') == ('1',)
    assert preds.firstIntegerPredicate('f1oo') == ()

    # Prefix and n-gram predicates, including inputs shorter than the window.
    assert preds.sameThreeCharStartPredicate(field) == ('123',)
    assert preds.sameThreeCharStartPredicate('12') == ('12',)
    assert preds.commonFourGram('12') == set()
github markhuberty / psClean / code / amadeus / amadeus_dedupe.py View on Github external
# Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.

    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print 'starting active labeling...'
    deduper.train(data_sample, dedupe.training.consoleLabel)

    # When finished, save our training away to disk
    deduper.writeTraining(training_file)

# Blocking
# Register two extra entries in the deduper's blocker_types mapping before
# blocking starts: 'Custom' gets the whole-set and common-set-element
# predicates, 'LatLong' gets the lat/long grid predicate.
# NOTE(review): presumably the keys are field-type names that the deduper
# looks up when choosing predicates -- confirm against the dedupe version used.
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print 'blocking...'

# Initialize the blocker
# blockingSettingsWrapper is called with the starting ppc value, dupes and the
# deduper; its three results are unpacked as the blocker and the final ppc and
# ucd values. A falsy blocker is treated as "no valid settings" further down.
# NOTE(review): the meaning of ppc / ucd is defined in patent_util -- confirm.
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(ppc,
                                                                    dupes,
                                                                    deduper
                                                                    )

# Occasionally the blocker fails to find useful values. If so,
# print the final values and exit.
if not blocker:
    print 'No valid blocking settings found'
    print 'Starting ppc value: %s' % ppc
github dedupeio / dedupe / dedupe / variables / categorical_type.py View on Github external
from .base import FieldType, DerivedType
from dedupe import predicates
from categorical import CategoricalComparator


class CategoricalType(FieldType):
    """Field type whose comparator is built from a declared category list.

    The field definition must carry a ``"categories"`` entry; it is handed
    to ``CategoricalComparator`` when the instance is created.
    """

    type = "Categorical"

    # Only the whole-field predicate is offered for this field type.
    _predicate_functions = [predicates.wholeFieldPredicate]

    def _categories(self, definition):
        """Return ``definition["categories"]``.

        Raises:
            ValueError: if the definition has no ``"categories"`` key.
        """
        try:
            declared = definition["categories"]
        except KeyError:
            raise ValueError('No "categories" defined')
        else:
            return declared

    def __init__(self, definition):
        """Run the base-class setup, then build the category comparator."""
        super(CategoricalType, self).__init__(definition)

        # Resolved after the base initialiser, as in the original ordering.
        self.comparator = CategoricalComparator(self._categories(definition))
github dedupeio / dedupe / dedupe / variables / fieldclasses.py View on Github external
    @staticmethod
    def comparator(price_1, price_2) :
        if price_1 <= 0 :
            return numpy.nan
        elif price_2 <= 0 :
            return numpy.nan
        else :
            return abs(numpy.log10(price_1) - numpy.log10(price_2))

class ShortStringType(FieldType):
    """Field type named "ShortString".

    Uses ``normalizedAffineGapDistance`` as its comparator and declares the
    predicate functions that may be used to block on a field of this type.
    """

    type = "ShortString"
    comparator = normalizedAffineGapDistance

    # Candidate blocking predicates, one per line so additions diff cleanly.
    _predicate_functions = (
        dedupe.predicates.wholeFieldPredicate,
        dedupe.predicates.tokenFieldPredicate,
        dedupe.predicates.firstTokenPredicate,
        dedupe.predicates.commonIntegerPredicate,
        dedupe.predicates.nearIntegersPredicate,
        dedupe.predicates.firstIntegerPredicate,
        dedupe.predicates.sameThreeCharStartPredicate,
        dedupe.predicates.sameFiveCharStartPredicate,
        dedupe.predicates.sameSevenCharStartPredicate,
        dedupe.predicates.commonFourGram,
        dedupe.predicates.commonSixGram,
        dedupe.predicates.commonTwoTokens,
        dedupe.predicates.commonThreeTokens,
        dedupe.predicates.fingerprint,
        dedupe.predicates.oneGramFingerprint,
        dedupe.predicates.twoGramFingerprint,
        dedupe.predicates.sortedAcronym,
    )

class StringType(ShortStringType) :
github dedupeio / dedupe / dedupe / variables / fieldclasses.py View on Github external
def __init__(self, definition):
        """Populate the field, its display name and its blocking predicates.

        ``definition`` must contain ``'field'``. An optional
        ``'variable name'`` entry overrides the default name, which is
        built as ``"(<field>: <type>)"``.
        """
        self.field = definition['field']

        # An explicit name wins; the default is only formatted when needed.
        self.name = (definition['variable name']
                     if 'variable name' in definition
                     else "(%s: %s)" % (self.field, self.type))

        # Wrap each predicate function declared on the class for this field.
        field_predicates = []
        for predicate_function in self._predicate_functions:
            field_predicates.append(
                predicates.SimplePredicate(predicate_function, self.field))
        self.predicates = field_predicates

        super(FieldType, self).__init__(definition)