# If we have training data saved from a previous run of dedupe,
# look for it and load it in.
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file) as tf:
        gazetteer.readTraining(tf)
# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as matches
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
dedupe.consoleLabel(gazetteer)
gazetteer.train()
# When finished, save our training away to disk
with open(training_file, 'w') as tf:
    gazetteer.writeTraining(tf)
# Make the canonical set
gazetteer.index(canonical)
# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
sys.setrecursionlimit(3000)
with open(settings_file, 'wb') as sf:
    gazetteer.writeSettings(sf, index=True)
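# A hedged sketch (not in the original snippet) of reusing the saved
# settings on a later run: dedupe.StaticGazetteer loads the learned
# weights, predicates, and index, so labeling and training are skipped.
# `messy_data` is a hypothetical dict of records keyed by id.
with open(settings_file, 'rb') as sf:
    static_gazetteer = dedupe.StaticGazetteer(sf)
matches = static_gazetteer.match(messy_data, threshold=0.5)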
# If we have training data saved from a previous run of dedupe,
# look for it and load it in.
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    deduper.readTraining(training_file)
# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
dedupe.consoleLabel(deduper)
deduper.train()
# When finished, save our training away to disk
deduper.writeTraining(training_file)
# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
deduper.writeSettings(settings_file)
# ## Blocking
print('blocking...')
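# A hedged sketch of the blocking/clustering steps that typically follow
# at this point in the classic dedupe examples; `data_d` is the record
# dict assumed to be in scope for this snippet.
threshold = deduper.threshold(data_d, recall_weight=1)
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)
print('# duplicate sets %d' % len(clustered_dupes))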
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(self.training_file):
    gray("Reading labeled examples from ", self.training_file)
    with open(self.training_file, "rb") as f:
        deduper.prepare_training(data_d, f)
else:
    deduper.prepare_training(data_d)
# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
gray("Starting active labeling...")
dedupe.consoleLabel(deduper)
# Using the examples we just labeled, train the deduper and learn
# blocking predicates
gray("Training...")
deduper.train()
# When finished, save our training to disk
gray("Saving results to training file...")
with open(self.training_file, "w+") as tf:
    deduper.writeTraining(tf)
# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
gray("Saving weights and predicates to settings file...")
with open(self.settings_file, "wb+") as sf:
    deduper.writeSettings(sf)
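# A hedged sketch (not part of the original snippet) of the "skip
# training" path the comment above describes: on a later run, the saved
# settings can be loaded into dedupe.StaticDedupe so no labeling or
# training is repeated.
with open(self.settings_file, "rb") as sf:
    deduper = dedupe.StaticDedupe(sf)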
    training_file : str
        A path to a training file that labeled examples will be loaded
        from.
    settings_file : str
        A path to a settings file that will be loaded if it exists.

    Returns
    -------
    dedupe.Dedupe
        A trained dedupe model instance.
    """
    # To train dedupe, we feed it a sample of records.
    sample_num = math.floor(len(data) * sample_size)
    deduper.sample(data, sample_num)
    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    deduper.train()
    # When finished, save our training to disk
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)
    # Save our weights and predicates to disk.
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)
    return deduper
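# Hypothetical usage of the function above (its signature is truncated
# in this snippet, so the name `train_dedupe` and its arguments are
# assumptions): train once, then cluster the data with the learned model.
deduper = train_dedupe(data, sample_size=0.3,
                       training_file='training.json',
                       settings_file='learned_settings')
threshold = deduper.threshold(data, recall_weight=1)
clustered_dupes = deduper.match(data, threshold)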
# pickle files must be opened in binary mode
grid_dict = pickle.load(open('grid_dict.pickle', 'rb'))
fields = [{'field' : 'insti_city', 'type': 'String', 'has missing' : True},
{'field' : 'insti_name', 'type': 'String', 'has missing' : True},
{'field' : 'insti_code', 'type': 'String', 'has missing' : True},
{'field' : 'insti_country', 'type': 'String', 'has missing': True},
]
linker = dedupe.RecordLink(fields, num_cores=args.cores)
linker.sample(grant_affils_dict, grid_dict, args.n)
if os.path.exists(args.training):
    linker = read_training_file(linker, args.training)
if not args.skiplabel:
    dedupe.consoleLabel(linker)
if args.verbose:
    print('training linker...')
linker.train(ppc=None, index_predicates=not args.nopredicates)
write_training_file(linker, args.training)  # update training file
if args.verbose:
    print('finding threshold...')
if args.threshold == 0:
    args.threshold = linker.threshold(grid_dict, grant_affils_dict,
                                      recall_weight=0.5)
linked_records = linker.match(grid_dict, grant_affils_dict,
                              threshold=args.threshold)
# add grid_id to grant_affils_dict
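# A hedged sketch of the step named in the comment above (the original
# code is truncated here). linker.match returns ((key_1, key_2), score)
# pairs, the first key from grid_dict and the second from
# grant_affils_dict, so the GRID id can be attached to each matched
# grant affiliation. The 'grid_id' and 'link_score' field names are
# assumptions.
for (grid_key, grant_key), score in linked_records:
    grant_affils_dict[grant_key]['grid_id'] = grid_key
    grant_affils_dict[grant_key]['link_score'] = score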
# If we have training data saved from a previous run of linker,
# look for it and load it in.
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    linker.readTraining(training_file)
# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as matches
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
dedupe.consoleLabel(linker)
linker.train()
# When finished, save our training away to disk
linker.writeTraining(training_file)
# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
linker.writeSettings(settings_file)
# ## Blocking
# ## Clustering
if __name__ == '__main__':
    params = Parameters()
    n_sample = params.n_sample
    print('prepare dataset...')
    nih_linkage_dict, nsf_linkage_dict = prepare_linkage_dict()
    fields = [{'field' : 'full_name', 'type': 'String', 'has missing' : True},
              {'field' : 'insti_city', 'type': 'String', 'has missing' : True},
              {'field' : 'insti_name', 'type': 'String', 'has missing' : True}]
    linker = dedupe.RecordLink(fields)
    linker.sample(nih_linkage_dict, nsf_linkage_dict, params.n_sample)
    if os.path.exists(params.training_file):
        linker = read_training_file(linker, params.training_file)
    dedupe.consoleLabel(linker)
    print('training linker...')
    linker.train(ppc=None)
    write_training_file(linker, params.training_file)  # update training file
    print('finding threshold...')
    if params.threshold is None:
        params.threshold = linker.threshold(nih_linkage_dict, nsf_linkage_dict,
                                            recall_weight=2.0)
    linked_records = linker.match(nih_linkage_dict, nsf_linkage_dict,
                                  threshold=params.threshold)
    print('Number of linked records: %s' % len(linked_records))
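    # A hedged follow-up sketch (not in the original): persist the linkage
    # so later steps can join NIH and NSF records without re-running
    # dedupe. The output file name and column names are assumptions.
    import csv
    with open('nih_nsf_linkage.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['nih_id', 'nsf_id', 'score'])
        for (nih_id, nsf_id), score in linked_records:
            writer.writerow([nih_id, nsf_id, score])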