# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
data_d = patent_util.readDataFrame(input_df)
# Build the comparators for class and coauthor
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
# Training
if os.path.exists(settings_file):
print 'reading from', settings_file
deduper = dedupe.Dedupe(settings_file)
else:
# To train dedupe, we feed it a random sample of records.
data_sample = dedupe.dataSample(data_d, 10 * input_df.shape[0])
# Define the fields dedupe will pay attention to
fields = {'Name': {'type': 'String', 'Has Missing':True},
'LatLong': {'type': 'LatLong', 'Has Missing':True},
'Class': {'type': 'Custom', 'comparator':class_comparator},
'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
'patent_ct':{'type': 'Custom', 'comparator': integer_diff},
'patent_ct_name': {'type': 'Interaction',
'Interaction Fields': ['Name', 'patent_ct']
}
}
# Create a new deduper object and pass our data model to it.
deduper = dedupe.Dedupe(fields)
# If we have training data saved from a previous run of dedupe,
# look for it an load it in.
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
# ## Training
if os.path.exists(r_settings_file):
print 'reading from', r_settings_file
deduper = dedupe.Dedupe(r_settings_file)
else:
# To train dedupe, we feed it a random sample of records.
data_sample = dedupe.dataSample(data_d, 600000)
# Define the fields dedupe will pay attention to
fields = {
'Name': {'type': 'String', 'Has Missing':True},
'LatLong': {'type': 'LatLong', 'Has Missing':True},
'Class': {'type': 'Custom', 'comparator':class_comparator},
'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
'patent_ct':{'type': 'Custom', 'comparator': integer_diff},
'patent_ct_name': {'type': 'Interaction',
'Interaction Fields': ['Name', 'patent_ct']
}
# 'Class_Count': {'type': 'Custom', 'comparator': idf},
# 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
# 'Class_Count_Class': {'type': 'Interaction',
# 'Interaction Fields': ['Class_Count', 'Class']
# },
# 'Coauthor_Count_Coauthor': {'type': 'Interaction',
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
# ## Training
if os.path.exists(r_settings_file):
print 'reading from', r_settings_file
deduper = dedupe.Dedupe(r_settings_file)
else:
# To train dedupe, we feed it a random sample of records.
data_sample = dedupe.dataSample(data_d, 600000)
# Define the fields dedupe will pay attention to
fields = {
'Name': {'type': 'String', 'Has Missing':True},
'LatLong': {'type': 'LatLong', 'Has Missing':True},
'Class': {'type': 'Custom', 'comparator':class_comparator},
'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
# 'Class_Count': {'type': 'Custom', 'comparator': idf},
# 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
# 'Class_Count_Class': {'type': 'Interaction',
# 'Interaction Fields': ['Class_Count', 'Class']
# },
# 'Coauthor_Count_Coauthor': {'type': 'Interaction',
# 'Interaction Fields': ['Coauthor_Count', 'Coauthor']
# }
}
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
# ## Training
if os.path.exists(r_settings_file):
print 'reading from', r_settings_file
deduper = dedupe.Dedupe(r_settings_file)
else:
# To train dedupe, we feed it a random sample of records.
data_sample = dedupe.dataSample(data_d, 600000)
# Define the fields dedupe will pay attention to
fields = {
'Name': {'type': 'String', 'Has Missing':True},
'LatLong': {'type': 'LatLong', 'Has Missing':True},
'Class': {'type': 'Custom', 'comparator':class_comparator},
'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
# 'Class_Count': {'type': 'Custom', 'comparator': idf},
# 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
# 'Class_Count_Class': {'type': 'Interaction',
# 'Interaction Fields': ['Class_Count', 'Class']
# },
# 'Coauthor_Count_Coauthor': {'type': 'Interaction',
# 'Interaction Fields': ['Coauthor_Count', 'Coauthor']
# }
}
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
# ## Training
if os.path.exists(r_settings_file):
print 'reading from', r_settings_file
deduper = dedupe.Dedupe(r_settings_file)
else:
# To train dedupe, we feed it a random sample of records.
data_sample = dedupe.dataSample(data_d, 600000)
# Define the fields dedupe will pay attention to
fields = {
'Name': {'type': 'String', 'Has Missing':True},
'LatLong': {'type': 'LatLong', 'Has Missing':True},
'Class': {'type': 'Custom', 'comparator':class_comparator},
'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
# 'Class_Count': {'type': 'Custom', 'comparator': idf},
# 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
# 'Class_Count_Class': {'type': 'Interaction',
# 'Interaction Fields': ['Class_Count', 'Class']
# },
# 'Coauthor_Count_Coauthor': {'type': 'Interaction',
# 'Interaction Fields': ['Coauthor_Count', 'Coauthor']
# }
}
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
# ## Training
if os.path.exists(r_settings_file):
print 'reading from', r_settings_file
deduper = dedupe.Dedupe(r_settings_file)
else:
# To train dedupe, we feed it a random sample of records.
data_sample = dedupe.dataSample(data_d, 600000)
# Define the fields dedupe will pay attention to
fields = {
'Name': {'type': 'String', 'Has Missing':True},
'LatLong': {'type': 'LatLong', 'Has Missing':True},
'Class': {'type': 'Custom', 'comparator':class_comparator},
'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
# 'Class_Count': {'type': 'Custom', 'comparator': idf},
# 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
# 'Class_Count_Class': {'type': 'Interaction',
# 'Interaction Fields': ['Class_Count', 'Class']
# },
# 'Coauthor_Count_Coauthor': {'type': 'Interaction',
# 'Interaction Fields': ['Coauthor_Count', 'Coauthor']
# }
}
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
# ## Training
if os.path.exists(r_settings_file):
print 'reading from', r_settings_file
deduper = dedupe.Dedupe(r_settings_file)
else:
# To train dedupe, we feed it a random sample of records.
data_sample = dedupe.dataSample(data_d, np.round(3 * input_df.shape[0], -1))
# Define the fields dedupe will pay attention to
fields = {
'Name': {'type': 'String', 'Has Missing':True},
'LatLong': {'type': 'LatLong', 'Has Missing':True},
'Class': {'type': 'Custom', 'comparator':class_comparator},
'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
# 'Class_Count': {'type': 'Custom', 'comparator': idf},
# 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
# 'Class_Count_Class': {'type': 'Interaction',
# 'Interaction Fields': ['Class_Count', 'Class']
# },
# 'Coauthor_Count_Coauthor': {'type': 'Interaction',
# 'Interaction Fields': ['Coauthor_Count', 'Coauthor']
# }
}
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
# ## Training
if os.path.exists(r_settings_file):
print 'reading from', r_settings_file
deduper = dedupe.Dedupe(r_settings_file)
else:
# To train dedupe, we feed it a random sample of records.
data_sample = dedupe.dataSample(data_d, np.round(3 * input_df.shape[0], -1))
# Define the fields dedupe will pay attention to
fields = {
'Name': {'type': 'String', 'Has Missing':True},
'LatLong': {'type': 'LatLong', 'Has Missing':True},
'Class': {'type': 'Custom', 'comparator':class_comparator},
'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
# 'Class_Count': {'type': 'Custom', 'comparator': idf},
# 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
# 'Class_Count_Class': {'type': 'Interaction',
# 'Interaction Fields': ['Class_Count', 'Class']
# },
# 'Coauthor_Count_Coauthor': {'type': 'Interaction',
# 'Interaction Fields': ['Coauthor_Count', 'Coauthor']
# }
}