How to use the dedupe.RecordLink function in dedupe

To help you get started, we’ve selected a few dedupe examples based on popular ways the library is used in public projects.

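All of the snippets below share the same dedupe 1.x workflow: declare the fields to compare, draw a sample of candidate pairs, label some of them, train, pick a threshold, then match. Here is a condensed sketch of that flow; load_records is a hypothetical stand-in for each project's own data preparation:

import dedupe

# load_records is hypothetical: anything returning
# {record_id: {field_name: value}} dicts will do.
data_1 = load_records('left.csv')
data_2 = load_records('right.csv')

fields = [{'field': 'name', 'type': 'String'},
          {'field': 'address', 'type': 'String'}]

linker = dedupe.RecordLink(fields)
linker.sample(data_1, data_2, 15000)  # candidate pairs for labeling
dedupe.consoleLabel(linker)           # label with 'y'/'n'/'u', 'f' to finish
linker.train()

threshold = linker.threshold(data_1, data_2, recall_weight=1)
for (id_1, id_2), score in linker.match(data_1, data_2, threshold=threshold):
    print(id_1, id_2, score)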

github dedupeio / dedupe / tests / canonical_matching.py
t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticRecordLink(f)
else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'address', 'type': 'String'},
              {'field': 'cuisine', 'type': 'String'},
              {'field': 'city', 'type': 'String'}]

    deduper = dedupe.RecordLink(fields)
    deduper.sample(data_1, data_2, 10000) 
    deduper.markPairs(training_pairs)
    deduper.train()

alpha = deduper.threshold(data_1, data_2)

with open(settings_file, 'wb') as f:
    deduper.writeSettings(f, index=True)


# print candidates
print('clustering...')
clustered_dupes = deduper.match(data_1, data_2, threshold=alpha)

print('Evaluate Clustering')
confirm_dupes = set(frozenset((data_1[pair[0]], data_2[pair[1]]))
                    for pair, score in clustered_dupes)
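The evaluation that follows compares the confirmed links against the known duplicate pairs counted at the top of the snippet. A hedged sketch of that bookkeeping, assuming duplicates_s holds the same frozenset-of-records shape:

true_positives = confirm_dupes & duplicates_s
print('precision', len(true_positives) / len(confirm_dupes))
print('recall', len(true_positives) / len(duplicates_s))
print('ran in', time.time() - t0, 'seconds')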
github dedupeio / dedupe / tests / test_api.py
def setUp(self):
    random.seed(123)
    numpy.random.seed(456)

    field_definition = [{'field': 'name', 'type': 'String'},
                        {'field': 'age', 'type': 'String'}]
    self.linker = dedupe.RecordLink(field_definition)
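A hypothetical companion test for this fixture, assuming dedupe 1.x stores the drawn sample on a data_sample attribute (the records are made up):

def test_sample(self):
    data_1 = {0: {'name': 'Bob', 'age': '51'},
              1: {'name': 'Linda', 'age': '50'}}
    data_2 = {0: {'name': 'Bob', 'age': '51'},
              1: {'name': 'Gene', 'age': '12'}}
    # Seeds fixed in setUp make the drawn sample deterministic.
    self.linker.sample(data_1, data_2, 10)
    self.assertTrue(len(self.linker.data_sample) > 0)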
github dedupeio / dedupe / examples / record_linkage_example / record_linkage_example.py
else:
    # Define the fields the linker will pay attention to
    #
    # Notice how we are telling the linker to use a custom field comparator
    # for the 'price' field. 
    fields = {
        'title': {'type': 'String'},
        'description': {'type': 'String',
                        'Has Missing': True},
        'price': {'type': 'Custom',
                  'comparator': comparePrice,
                  'Has Missing': True}}

    # Create a new linker object and pass our data model to it.
    linker = dedupe.RecordLink(fields)
    # To train the linker, we feed it a random sample of records.
    linker.sample(data_1, data_2, 150000)

    # If we have training data saved from a previous run of linker,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        linker.readTraining(training_file)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as matches
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
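The snippet references a comparePrice helper and stops just before the labeling step. A sketch of both, assuming a numpy import and the same dedupe 1.x API used elsewhere on this page:

import numpy

def comparePrice(price_1, price_2):
    # Zero means missing here; otherwise compare prices on a log scale.
    if price_1 == 0 or price_2 == 0:
        return numpy.nan
    return abs(numpy.log10(price_1) - numpy.log10(price_2))

dedupe.consoleLabel(linker)   # the active-learning loop described above
linker.train()

with open(training_file, 'w') as tf:
    linker.writeTraining(tf)  # save labels for the next run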
github titipata / grant_database / dedupe / link_grant.py
    nsf_linkage_dict = nsf_linkage[cname].to_dict('records')
    nsf_linkage_dict = dict((i, d) for (i, d) in enumerate(nsf_linkage_dict))
    return nih_linkage_dict, nsf_linkage_dict


if __name__ == '__main__':
    params = Parameters()
    n_sample = params.n_sample

    print('prepare dataset...')
    nih_linkage_dict, nsf_linkage_dict = prepare_linkage_dict()

    fields = [{'field': 'full_name', 'type': 'String', 'has missing': True},
              {'field': 'insti_city', 'type': 'String', 'has missing': True},
              {'field': 'insti_name', 'type': 'String', 'has missing': True}]
    linker = dedupe.RecordLink(fields)
    linker.sample(nih_linkage_dict, nsf_linkage_dict, params.n_sample)
    if os.path.exists(params.training_file):
        linker = read_training_file(linker, params.training_file)

    dedupe.consoleLabel(linker)
    print('training linker...')
    linker.train(ppc=None)
    write_training_file(linker, params.training_file) # update training file

    print('finding threshold...')
    if params.threshold is None:
        params.threshold = linker.threshold(nih_linkage_dict, nsf_linkage_dict,
                                            recall_weight=2.0)

    linked_records = linker.match(nih_linkage_dict, nsf_linkage_dict,
                                  threshold=params.threshold)
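linker.match returns ((id_1, id_2), score) tuples, so consuming the result is a two-line loop; the output format below is illustrative:

for (nih_id, nsf_id), score in linked_records:
    print(nih_id, nsf_id, round(float(score), 3))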
github explosion / prodigy-recipes / contrib / dedupe / link_records.py
yield record['description']

    with open(fields_json_file_path) as fields_json_file:
        fields = json.load(fields_json_file)

    for field in fields:
        validate_field(field)
        if field['type'] == 'Text' and 'corpus' in field:
            func_name = field['corpus'][1:-1]
            field['corpus'] = locals()[func_name]()

    print('LEN RECORDS: ', len(left_records) / 2, len(right_records) / 2)
    print('MIN SAMPLE', min(len(left_records) / 2, len(right_records) / 2))
    print(fields)

    linker = dedupe.RecordLink(fields)
    # To train the linker, we feed it a sample of records.
    linker.sample(
        left_records,
        right_records,
        round(min(len(left_records) / 2, len(right_records) / 2))
    )

    print('getting examples')
    # If we have training data saved from a previous run of linker,
    # look for it and load it in.
    examples = db.get_dataset(dataset)
    if len(examples) > 0:
        linker = update_linker(linker, examples)

    def update(examples, linker=linker):
        print(len(examples))
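update_linker is called above but not shown. A hypothetical sketch of what it might do, assuming each Prodigy example carries record_1/record_2 payloads and an accept/reject answer (the payload keys are assumptions; markPairs is the dedupe 1.x labeled-pair API):

def update_linker(linker, examples):
    training_pairs = {'match': [], 'distinct': []}
    for eg in examples:
        pair = (eg['record_1'], eg['record_2'])  # assumed payload keys
        if eg['answer'] == 'accept':
            training_pairs['match'].append(pair)
        elif eg['answer'] == 'reject':
            training_pairs['distinct'].append(pair)
    linker.markPairs(training_pairs)
    linker.train()
    return linker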
github maxharlow / csvmatch / fuzzybilenko.py
    def executor(data1, data2):
        input1 = {i: {fields1[j]: value for j, value in enumerate(row)} for i, row in enumerate(data1)}
        input2 = {i: {fields1[j]: value for j, value in enumerate(row)} for i, row in enumerate(data2)}
        fields = [{'field': field, 'type': 'String'} for field in fields1]
        linker = dedupe.RecordLink(fields)
        linker.sample(input1, input2, sample_size=1500)
        while True:
            labelling(linker)
            try:
                linker.train()
                break
            except Exception:
                sys.stderr.write('\nYou need to do more training.\n')
        threshold = linker.threshold(input1, input2, recall_weight=1)
        pairs = linker.match(input1, input2, threshold)
        matches = []
        for pair in pairs:
            matches.append((pair[0][0], pair[0][1], pair[1]))
        return matches
    return executor
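Each pair from linker.match unpacks as ((index_1, index_2), score), so the triples returned here are (row index in data1, row index in data2, confidence). A hedged usage sketch; rows_left and rows_right are made-up lists of rows:

# Hypothetical: the enclosing factory (clipped above) binds fields1 and
# returns executor, so calling it yields (index1, index2, score) triples.
rows_left = [['Bob', '51'], ['Linda', '50']]
rows_right = [['Bob', '51'], ['Gene', '12']]
for index1, index2, score in executor(rows_left, rows_right):
    print(index1, index2, score)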
github titipata / grant_database / dedupe / link_affiliation.py
        grant_affils_df, grid_df = prepare_df()
        grant_affils_dict = dataframe_to_dict(grant_affils_df)
        grid_dict = dataframe_to_dict(grid_df)
        pickle.dump(grant_affils_dict, open('grant_affils_dict.pickle', 'wb'), protocol=2)
        pickle.dump(grid_dict, open('grid_dict.pickle', 'wb'), protocol=2)
    else:
        grant_affils_dict = pickle.load(open('grant_affils_dict.pickle', 'rb'))
        grid_dict = pickle.load(open('grid_dict.pickle', 'rb'))

    fields = [{'field': 'insti_city', 'type': 'String', 'has missing': True},
              {'field': 'insti_name', 'type': 'String', 'has missing': True},
              {'field': 'insti_code', 'type': 'String', 'has missing': True},
              {'field': 'insti_country', 'type': 'String', 'has missing': True}]

    linker = dedupe.RecordLink(fields, num_cores=args.cores)

    linker.sample(grant_affils_dict, grid_dict, args.n)
    if os.path.exists(args.training):
        linker = read_training_file(linker, args.training)

    if not args.skiplabel:
        dedupe.consoleLabel(linker)

    if args.verbose:
        print('training linker...')
    linker.train(ppc=None, index_predicates=not args.nopredicates)
    write_training_file(linker, args.training) # update training file

    if args.verbose:
        print('finding threshold...')
    if args.threshold == 0: