t0 = time.time()
print('number of known duplicate pairs', len(duplicates_s))
if os.path.exists(settings_file):
with open(settings_file, 'rb') as f :
deduper = dedupe.StaticRecordLink(f)
else:
fields = [{'field': 'name', 'type': 'String'},
{'field': 'address', 'type': 'String'},
{'field': 'cuisine', 'type': 'String'},
{'field': 'city','type' : 'String'}
]
deduper = dedupe.RecordLink(fields)
deduper.sample(data_1, data_2, 10000)
deduper.markPairs(training_pairs)
deduper.train()
alpha = deduper.threshold(data_1, data_2)
with open(settings_file, 'wb') as f:
deduper.writeSettings(f, index=True)
# print candidates
print('clustering...')
clustered_dupes = deduper.match(data_1, data_2, threshold=alpha)
print('Evaluate Clustering')
confirm_dupes = set(frozenset((data_1[pair[0]], data_2[pair[1]]))
def setUp(self) :
random.seed(123)
numpy.random.seed(456)
field_definition = [{'field' : 'name', 'type': 'String'},
{'field' :'age', 'type': 'String'}]
self.linker = dedupe.RecordLink(field_definition)
else:
# Define the fields the linker will pay attention to
#
# Notice how we are telling the linker to use a custom field comparator
# for the 'price' field.
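# comparePrice is referenced below but not defined in this snippet. For a
# 'Custom' field, dedupe only needs a function that takes the two field
# values and returns a numeric distance (smaller means more similar). The
# definition here is an illustrative sketch, not the original helper.
import numpy

def comparePrice(price_1, price_2):
    # Treat a zero/missing price as "no information" and otherwise
    # compare prices on a log scale.
    if not price_1 or not price_2:
        return numpy.nan
    return abs(numpy.log10(price_1) - numpy.log10(price_2))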
fields = {
'title': {'type': 'String'},
'description': {'type': 'String',
'Has Missing' :True},
'price': {'type' : 'Custom',
'comparator' : comparePrice,
'Has Missing' : True}}
# Create a new linker object and pass our data model to it.
linker = dedupe.RecordLink(fields)
# To train the linker, we feed it a random sample of records.
linker.sample(data_1, data_2, 150000)
# If we have training data saved from a previous run of linker,
# look for it and load it in.
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(training_file):
print('reading labeled examples from', training_file)
linker.readTraining(training_file)
# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as matches
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
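# The interactive loop described above is normally driven by dedupe's
# built-in console labeller; the call below is the usual next step, though
# it is not part of the snippet itself.
dedupe.consoleLabel(linker)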
nsf_linkage_dict = nsf_linkage[cname].to_dict('records')
nsf_linkage_dict = dict((i,d) for (i, d) in enumerate(nsf_linkage_dict))
return nih_linkage_dict, nsf_linkage_dict
if __name__ == '__main__':
params = Parameters()
n_sample = params.n_sample
print('prepare dataset...')
nih_linkage_dict, nsf_linkage_dict = prepare_linkage_dict()
fields = [{'field' : 'full_name', 'type': 'String', 'has missing' : True},
{'field' : 'insti_city', 'type': 'String', 'has missing' : True},
{'field' : 'insti_name', 'type': 'String', 'has missing' : True}]
linker = dedupe.RecordLink(fields)
linker.sample(nih_linkage_dict, nsf_linkage_dict, params.n_sample)
if os.path.exists(params.training_file):
linker = read_training_file(linker, params.training_file)
dedupe.consoleLabel(linker)
print('training linker...')
linker.train(ppc=None)
write_training_file(linker, params.training_file) # update training file
print('finding threshold...')
if params.threshold is None:
params.threshold = linker.threshold(nih_linkage_dict, nsf_linkage_dict,
recall_weight=2.0)
linked_records = linker.match(nih_linkage_dict, nsf_linkage_dict,
threshold=params.threshold)
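# linker.match returns pairs of record ids together with a confidence score.
# A quick way to inspect the result (purely illustrative; field names follow
# the dictionaries built above):
for (nih_id, nsf_id), score in linked_records:
    print(nih_linkage_dict[nih_id]['full_name'],
          nsf_linkage_dict[nsf_id]['full_name'],
          round(float(score), 3))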
yield record['description']
with open(fields_json_file_path) as fields_json_file:
fields = json.load(fields_json_file)
for field in fields:
validate_field(field)
if field['type'] == 'Text' and 'corpus' in field:
func_name = field['corpus'][1:-1]
field['corpus'] = locals()[func_name]()
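# For reference, a Text field with a corpus in the fields JSON might look
# like the hypothetical entry below, where the corpus value is wrapped in a
# pair of marker characters that the [1:-1] slice above strips off, and
# description_corpus is a local generator function yielding strings:
#   {"field": "description", "type": "Text", "corpus": "<description_corpus>"}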
print('LEN RECORDS: ', len(left_records) / 2, len(right_records) / 2)
print('MIN SAMPLE', min(len(left_records) / 2, len(right_records) / 2))
print(fields)
linker = dedupe.RecordLink(fields)
# To train the linker, we feed it a sample of records.
linker.sample(
left_records,
right_records,
round(min(len(left_records) / 2, len(right_records) / 2))
)
print('getting examples')
# If we have training data saved from a previous run of linker,
# look for it and load it in.
examples = db.get_dataset(dataset)
if len(examples) > 0:
linker = update_linker(linker, examples)
def update(examples, linker=linker):
print(len(examples))
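# update_linker is not shown in this snippet. Assuming the stored examples
# are already grouped into 'match' / 'distinct' pairs of record dictionaries
# (an assumption), a minimal version could feed them straight to markPairs:
def update_linker(linker, examples):
    linker.markPairs({'match': examples.get('match', []),
                      'distinct': examples.get('distinct', [])})
    return linker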
def executor(data1, data2):
input1 = {i: {fields1[j]: value for j, value in enumerate(row)} for i, row in enumerate(data1)}
input2 = {i: {fields1[j]: value for j, value in enumerate(row)} for i, row in enumerate(data2)}
fields = [{'field': field, 'type': 'String'} for field in fields1]
linker = dedupe.RecordLink(fields)
linker.sample(input1, input2, sample_size=1500)
while True:
labelling(linker)
try:
linker.train()
break
except Exception:
    sys.stderr.write('\nYou need to do more training.\n')
threshold = linker.threshold(input1, input2, recall_weight=1)
pairs = linker.match(input1, input2, threshold)
matches = []
for pair in pairs:
matches.append((pair[0][0], pair[0][1], pair[1]))
return matches
return executor
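# Usage sketch for the closure returned above (hypothetical names and data;
# the enclosing factory that builds `executor` is not shown here):
#   fields1 = ['name', 'city']
#   data1 = [('Acme Corp', 'Boston')]
#   data2 = [('ACME Corporation', 'Boston')]
#   for left_id, right_id, score in executor(data1, data2):
#       print(left_id, right_id, score)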
grant_affils_df, grid_df = prepare_df()
grant_affils_dict = dataframe_to_dict(grant_affils_df)
grid_dict = dataframe_to_dict(grid_df)
pickle.dump(grant_affils_dict, open('grant_affils_dict.pickle', 'wb'), protocol=2)
pickle.dump(grid_dict, open('grid_dict.pickle', 'wb'), protocol=2)
else:
grant_affils_dict = pickle.load(open('grant_affils_dict.pickle', 'rb'))
grid_dict = pickle.load(open('grid_dict.pickle', 'rb'))
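# dataframe_to_dict (used above) is not defined in this snippet; a minimal
# version producing the {row_index: {column: value, ...}, ...} mapping that
# dedupe expects could be (an assumption, mirroring the enumerate pattern
# used with to_dict('records') elsewhere in these examples):
#     def dataframe_to_dict(df):
#         return dict(enumerate(df.to_dict('records')))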
fields = [{'field' : 'insti_city', 'type': 'String', 'has missing' : True},
{'field' : 'insti_name', 'type': 'String', 'has missing' : True},
{'field' : 'insti_code', 'type': 'String', 'has missing' : True},
{'field' : 'insti_country', 'type': 'String', 'has missing': True},
]
linker = dedupe.RecordLink(fields, num_cores=args.cores)
linker.sample(grant_affils_dict, grid_dict, args.n)
if os.path.exists(args.training):
linker = read_training_file(linker, args.training)
if not args.skiplabel:
dedupe.consoleLabel(linker)
if args.verbose:
print('training linker...')
linker.train(ppc=None, index_predicates=not args.nopredicates)
write_training_file(linker, args.training) # update training file
if args.verbose:
print('finding threshold...')
if args.threshold == 0: