How to use the dedupe.dataSample function in dedupe

To help you get started, we’ve selected a few dedupe examples, based on popular ways dedupe.dataSample is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github markhuberty / psClean / code / dedupe / patstat_dedupe.py View on Github external
data_d = patent_util.readDataFrame(input_df)

# Build the comparators for class and coauthor
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# Training
if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.Dedupe(settings_file)

else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 10 * input_df.shape[0])
    # Define the fields dedupe will pay attention to
    fields = {'Name': {'type': 'String', 'Has Missing':True},
              'LatLong': {'type': 'LatLong', 'Has Missing':True},
              'Class': {'type': 'Custom', 'comparator':class_comparator},
              'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
              'patent_ct':{'type': 'Custom', 'comparator': integer_diff},
              'patent_ct_name': {'type': 'Interaction',
                                 'Interaction Fields': ['Name', 'patent_ct']
                                 }
              }

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
github markhuberty / psClean / code / dedupe / gb_weighted / patent_example_twostage_gb.py View on Github external
## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
            'patent_ct':{'type': 'Custom', 'comparator': integer_diff},
            'patent_ct_name': {'type': 'Interaction',
                               'Interaction Fields': ['Name', 'patent_ct']
                               }
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
github dedupeio / dedupe / examples / patent_example / patent_example_twostage.py View on Github external
## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
            #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
            #                             }
            }
github markhuberty / psClean / code / dedupe / archive / dk / patent_example_twostage_dk.py View on Github external
## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
            #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
            #                             }
            }
github markhuberty / psClean / code / dedupe / archive / fi / patent_example_twostage_fi.py View on Github external
## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
            #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
            #                             }
            }
github markhuberty / psClean / code / dedupe / archive / be / patent_example_twostage_be.py View on Github external
## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
            #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
            #                             }
            }
github markhuberty / psClean / code / dedupe / archive / es / patent_example_twostage_es.py View on Github external
## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, np.round(3 * input_df.shape[0], -1))
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
            #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
            #                             }
            }
github markhuberty / psClean / code / dedupe / archive / it / patent_example_twostage_it.py View on Github external
## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, np.round(3 * input_df.shape[0], -1))
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
            #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
            #                             }
            }