import os
import time

import dedupe

data_d, header = canonicalImport(raw_data)

training_pairs = dedupe.trainingDataDedupe(data_d,
                                           'unique_id',
                                           5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])

t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, 1)
else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'name', 'type': 'Exact'},
              {'field': 'address', 'type': 'String'},
              {'field': 'cuisine', 'type': 'ShortString',
               'has missing': True},
              {'field': 'city', 'type': 'ShortString'}]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    deduper.sample(data_d, 10000)
    deduper.markPairs(training_pairs)
    deduper.train()

    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)
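The example above assumes a `canonicalImport` helper that turns the raw input into a dict of records keyed by row id, plus the header. A minimal sketch of such a loader, assuming `raw_data` is a path to a CSV file (the name `canonical_import_sketch` and the exact normalisation are assumptions, not the original helper):

import csv

def canonical_import_sketch(path):
    # Read a CSV into {row_id: record_dict}; values are lower-cased and
    # stripped so dedupe's string comparators behave consistently.
    data_d = {}
    with open(path) as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        for i, row in enumerate(reader):
            data_d[i] = {k: v.strip().lower() for k, v in row.items()}
    return data_d, header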
import os
import time
from itertools import groupby

import dedupe

duplicates = set()
for _, pair in groupby(sorted(data_d.items(),
                              key=lambda x: x[1]['unique_id']),
                       key=lambda x: x[1]['unique_id']):
    pair = list(pair)
    if len(pair) == 2:
        a, b = pair
        duplicates.add(frozenset((a[0], b[0])))

t0 = time.time()

print('number of known duplicate pairs', len(duplicates))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, 1)
else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'name', 'type': 'Exact'},
              {'field': 'address', 'type': 'String'},
              {'field': 'cuisine', 'type': 'ShortString',
               'has missing': True},
              {'field': 'city', 'type': 'ShortString'}]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    deduper.prepare_training(data_d, sample_size=10000)
    deduper.markPairs(training_pairs)
    deduper.train(index_predicates=False)

    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)
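`markPairs` consumes labelled training data in dedupe's standard shape: a dict with 'match' and 'distinct' keys, each holding pairs of records. A small hand-built illustration (the field values below are made up for the restaurant-style fields used above):

# Illustrative only: this is the structure passed to deduper.markPairs(...)
training_pairs = {
    'match': [
        ({'name': "arnie morton's of chicago", 'address': '435 s. la cienega blvd.',
          'cuisine': 'steakhouses', 'city': 'los angeles'},
         {'name': "arnie morton's of chicago", 'address': '435 s. la cienega blv.',
          'cuisine': 'american', 'city': 'los angeles'}),
    ],
    'distinct': [
        ({'name': "arnie morton's of chicago", 'address': '435 s. la cienega blvd.',
          'cuisine': 'steakhouses', 'city': 'los angeles'},
         {'name': "art's deli", 'address': '12224 ventura blvd.',
          'cuisine': 'delis', 'city': 'studio city'}),
    ],
}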
def find_duplicates(self, train, push, label):
    """
    Find duplicates and print them via stdout
    """
    # data_d = self.read_data()
    data_d = self.read_data(label, invalid=False)
    # pprint.pprint(data_d)

    if train:
        deduper = self.train(data_d)
    else:
        if os.path.exists(self.settings_file):
            print("reading from", self.settings_file)
            with open(self.settings_file, "rb") as f:
                deduper = dedupe.StaticDedupe(f)
        else:
            red("Error: settings file does not exist, stopping")
            sys.exit(1)

    cyan("Finding the threshold for data...")
    threshold = deduper.threshold(data_d, recall_weight=1)

    cyan("Clustering...")
    clustered_dupes = deduper.match(data_d, threshold)

    cyan("Number of duplicate sets: " + str(len(clustered_dupes)))
    for aset in clustered_dupes:
        yellow("Found a duplicated pair...")
        ids, values = aset
        primary_issue = None  # reflects the primary ticket in a set of
        # duplicates; all the duplicates should point
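`red`, `cyan`, and `yellow` in this snippet are assumed to be small colour-printing helpers; a minimal ANSI-escape sketch of what they might look like (the real implementations in the project may differ):

def _colored(code, text):
    # Print text wrapped in an ANSI colour escape sequence.
    print("\033[{}m{}\033[0m".format(code, text))

def red(text):
    _colored(31, text)

def cyan(text):
    _colored(36, text)

def yellow(text):
    _colored(33, text)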
    NOTE: should be called from the RVD repository directory.

    :param flaw, Flaw
    :return list
    """
    data_d = self.read_data(None, invalid=False)  # data dict
    # pprint.pprint(data_d)

    # Append the flaw to the data dictionary with the ID 0
    data_d[0] = flaw.document_duplicates()
    # pprint.pprint(data_d)

    if os.path.exists(self.settings_file):
        print("reading from", self.settings_file)
        with open(self.settings_file, "rb") as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        red("Error: settings file does not exist, stopping")
        sys.exit(1)

    cyan("Finding the threshold for data...")
    threshold = deduper.threshold(data_d, recall_weight=1)

    cyan("Clustering...")
    clustered_dupes = deduper.match(data_d, threshold)
    # pprint.pprint(clustered_dupes)  # debug purposes

    # If ID 0 (corresponds with flaw passed as arg) is in there, is_duplicate
    for aset in clustered_dupes:
        ids, values = aset
        if 0 in ids:
            return list(ids)
    for row in reader:
        clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
        row_id = int(row['Id'])
        data_d[row_id] = dict(clean_row)

    return data_d


print 'importing data ...'
data_d = readData(input_file)

# ## Training

if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.StaticDedupe(settings_file)
else:
    # Define the fields dedupe will pay attention to
    #
    # Notice how we are telling dedupe to use a custom field comparator
    # for the 'Zip' field.
    fields = {
        'Site name': {'type': 'String'},
        'Address': {'type': 'String'},
        'Zip': {'type': 'Custom',
                'comparator': sameOrNotComparator,
                'Has Missing': True},
        'Phone': {'type': 'String', 'Has Missing': True},
    }

    # Create a new deduper object and pass our data model to it.
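This older example relies on two helpers that are not shown: `preProcess`, which normalises each column value, and `sameOrNotComparator`, the custom comparator wired to the 'Zip' field. A rough sketch of their expected behaviour (the details are assumptions, modelled on dedupe's CSV example):

import re

def preProcess(column):
    # Normalise a raw CSV value: collapse whitespace, strip quotes,
    # lower-case, and turn empty strings into None so dedupe treats
    # them as missing.
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower()
    if not column:
        column = None
    return column

def sameOrNotComparator(field_1, field_2):
    # Custom comparator: 0 when both values are present and equal,
    # 1 when both are present and different; if either is missing,
    # return nothing and let dedupe handle the missing value.
    if field_1 and field_2:
        return 0 if field_1 == field_2 else 1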
    Returns
    -------
    dedupe.Dedupe
        A dedupe model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, field_properties)

    if update_model == False:

        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)

        # Create a new deduper object and pass our data model to it.
        else:
            # Initialise dedupe
            deduper = dedupe.Dedupe(fields)

            # Launch active learning
            deduper = _active_learning(data, sample_size, deduper, training_file, settings_file)

    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Dedupe(fields)

        # Import existing model
        print('reading labeled examples from ', training_file)