print('number of known duplicate pairs', len(duplicates_s))
if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, 1)
else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'name', 'type': 'Exact'},
              {'field': 'address', 'type': 'String'},
              {'field': 'cuisine', 'type': 'ShortString',
               'has missing': True},
              {'field': 'city', 'type': 'ShortString'}
              ]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    deduper.sample(data_d, 10000)
    deduper.markPairs(training_pairs)
    deduper.train()

    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)

alpha = deduper.threshold(data_d, 1)

# print candidates
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold=alpha)

print('Evaluate Clustering')
confirm_dupes = set()
for dupes, score in clustered_dupes:
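    # (Hedged completion: the loop body is not included in this excerpt. Given
    # the confirm_dupes set above, one plausible use is to collect every record
    # id that dedupe placed in a cluster:)
    confirm_dupes.update(dupes)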
def test_exact_comparator(self):
    deduper = dedupe.Dedupe([{'field': 'name',
                              'type': 'Exact'}])
    record_pairs = (({'name': 'Shmoo'}, {'name': 'Shmee'}),
                    ({'name': 'Shmoo'}, {'name': 'Shmoo'}))
    numpy.testing.assert_array_almost_equal(
        deduper.data_model.distances(record_pairs),
        numpy.array([[0.0],
                     [1.0]]),
        3)
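    # i.e. the Exact comparator scores the non-identical pair ('Shmoo',
    # 'Shmee') as 0.0 and the identical pair ('Shmoo', 'Shmoo') as 1.0.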
data_d = patent_util.readDataFrame(consolidated_input)
del consolidated_input
input_df.set_index(cluster_name, inplace=True)
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
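# (Added note: CosineSimilarity is constructed from the full list of field
# values, presumably so it can learn corpus-level term weights; the comparator
# it returns is then called on a pair of values, e.g.
# class_comparator(class_1, class_2), yielding a cosine distance.)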
# ## Training
if os.path.exists(r_settings_file):
    print('reading from', r_settings_file)
    deduper = dedupe.Dedupe(r_settings_file)
else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 600000)
    # Define the fields dedupe will pay attention to
    fields = {
        'Name': {'type': 'String', 'Has Missing': True},
        'LatLong': {'type': 'LatLong', 'Has Missing': True},
        'Class': {'type': 'Custom', 'comparator': class_comparator},
        'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}  # ,
        # 'Class_Count': {'type': 'Custom', 'comparator': idf},
        # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
        # 'Class_Count_Class': {'type': 'Interaction',
        #                       'Interaction Fields': ['Class_Count', 'Class']
        #                       },
        # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
        #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
        #                             }
    }
    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)
    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file.
    # The json file is of the form:
    # {0: [[{field:val dict of record 1}, {field:val dict of record 2}], ... (more nonmatch pairs)],
    #  1: [[{field:val dict of record 1}, {field:val dict of record 2}], ... (more match pairs)]
    # }
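    # For illustration only (hypothetical values; field names from the schema
    # above), such a file might contain:
    # {"0": [[{"Name": "smith, j", "LatLong": null, "Class": "428", "Coauthor": "doe"},
    #         {"Name": "brown, a", "LatLong": null, "Class": "106", "Coauthor": "roe"}]],
    #  "1": [[{"Name": "smith, j", "LatLong": null, "Class": "428", "Coauthor": "doe"},
    #         {"Name": "smith, john", "LatLong": null, "Class": "428", "Coauthor": "doe"}]]}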
    if os.path.exists(r_training_file):
        print('reading labeled examples from', r_training_file)
        deduper.train(data_sample, r_training_file)
    # ## Active learning
    # Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
fields = {'name': {'type': 'String'},
          'address': {'type': 'String', 'Has Missing': True},
          'city': {'type': 'String', 'Has Missing': True},
          'state': {'type': 'String'},
          'zip': {'type': 'String', 'Has Missing': True},
          'person': {'type': 'Categorical',
                     'Categories': [0, 1]},
          'person-address': {'type': 'Interaction',
                             'Interaction Fields': ['person', 'address']},
          'name-address': {'type': 'Interaction',
                           'Interaction Fields': ['name', 'address']}
          }
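# (Added note: an Interaction field asks dedupe to learn a weight for the
# combination of the named fields' similarities, so 'name-address' lets the
# model treat "name and address both match" as stronger evidence than either
# match alone.)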
# Create a new deduper object and pass our data model to it.
deduper = dedupe.Dedupe(fields, data_sample, num_processes=4)
# If we have training data saved from a previous run of dedupe,
# look for it and load it in.
#
# __Note:__ if you want to train from
# scratch, delete the training_file
if os.path.exists(training_file):
    print('reading labeled examples from', training_file)
    deduper.readTraining(training_file)
# ## Active learning
print('starting active labeling...')
# Starts the training loop. Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
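# A hedged sketch of the next two steps, assuming the convenience helper that
# shipped with this era of the dedupe API (not shown in this excerpt):
dedupe.consoleLabel(deduper)  # interactively label the uncertain pairs
deduper.train()               # then learn weights from the labeled examples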
import itertools
os.chdir('./examples/sqlite_example/')
settings_file = 'sqlite_example_settings.json'
t0 = time.time()
con = sqlite3.connect("illinois_contributions.db")
con.row_factory = sqlite3.Row
con.execute("ATTACH DATABASE 'blocking_map.db' AS bm")
cur = con.cursor()
if os.path.exists(settings_file):
    print('reading from', settings_file)
    deduper = dedupe.Dedupe(settings_file)
else:
    raise ValueError('Settings File Not Found')
# We grab all the block_keys with more than one record associated with
# it. These associated records will make up a block of records we will
# compare within.
blocking_key_sql = "SELECT key, COUNT(donor_id) AS num_candidates " \
"FROM bm.blocking_map GROUP BY key HAVING num_candidates > 1"
block_keys = (row['key'] for row in con.execute(blocking_key_sql))
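# (Added note: dedupe will only compare record pairs that share a block key,
# which is what keeps the number of comparisons tractable; singleton blocks
# are excluded above because they cannot contain a duplicate pair.)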
# This grabs a block of records for comparison. We rely on the
# ordering of the donor_ids
donor_select = "SELECT donor_id, LOWER(city) AS city, " \
"LOWER(first_name) AS first_name, " \
"LOWER(last_name) AS last_name, " \
else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 10 * input_df.shape[0])
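    # integer_diff is referenced in the field definition below but is not
    # shown in this excerpt. A minimal sketch of such a custom comparator
    # (an assumption, not the source's implementation):
    def integer_diff(field_1, field_2):
        # Custom dedupe comparators take two field values and return a
        # numeric distance; here, the absolute difference of two counts.
        return abs(int(field_1) - int(field_2))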
    # Define the fields dedupe will pay attention to
    fields = {'Name': {'type': 'String', 'Has Missing': True},
              'LatLong': {'type': 'LatLong', 'Has Missing': True},
              'Class': {'type': 'Custom', 'comparator': class_comparator},
              'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
              'patent_ct': {'type': 'Custom', 'comparator': integer_diff},
              'patent_ct_name': {'type': 'Interaction',
                                 'Interaction Fields': ['Name', 'patent_ct']
                                 }
              }
    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)
    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file.
    # The json file is of the form:
    # {0: [[{field:val dict of record 1}, {field:val dict of record 2}], ... (more nonmatch pairs)],
    #  1: [[{field:val dict of record 1}, {field:val dict of record 2}], ... (more match pairs)]
    # }
    if os.path.exists(training_file):
        print('reading labeled examples from', training_file)
        deduper.train(data_sample, training_file)
    # ## Active learning
    # Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.