# Reset the index so that it is sequential, then
# store the new:old index map
consolidated_input.reset_index(inplace=True)
index_map = consolidated_input['index'].to_dict()
data_d = patent_util.readDataFrame(consolidated_input)
del consolidated_input
input_df.set_index(cluster_name, inplace=True)

## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
if os.path.exists(r_settings_file):
    print 'reading from', r_settings_file
    deduper = dedupe.Dedupe(r_settings_file)
else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 600000)

    # Define the fields dedupe will pay attention to
    fields = {
        'Name': {'type': 'String', 'Has Missing': True},
        'LatLong': {'type': 'LatLong', 'Has Missing': True},
        'Class': {'type': 'Custom', 'comparator': class_comparator},
        'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
        # 'Class_Count': {'type': 'Custom', 'comparator': idf},
        }
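# Aside (illustration added here, not part of the original script): with an
# unnamed index, DataFrame.reset_index(inplace=True) moves the old index
# values into a new column named 'index', so index_map above maps each new
# sequential position back to the original row label.
import pandas as pd

_demo = pd.DataFrame({'Name': ['smith', 'jones']}, index=[10, 42])
_demo.reset_index(inplace=True)
assert _demo['index'].to_dict() == {0: 10, 1: 42}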
# Import the data
print 'importing data ...'
input_df = pd.read_csv(input_file)
input_df.Class.fillna('', inplace=True)
input_df.Coauthor.fillna('', inplace=True)
input_df.Lat.fillna('0.0', inplace=True)
input_df.Lng.fillna('0.0', inplace=True)
input_df.Name.fillna('', inplace=True)

# Read the data into a format dedupe can use
data_d = patent_util.readDataFrame(input_df)

# Build the comparators for class and coauthor
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# Training
if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.Dedupe(settings_file)
else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 10 * input_df.shape[0])

    # Define the fields dedupe will pay attention to
    fields = {'Name': {'type': 'String', 'Has Missing': True},
              'LatLong': {'type': 'LatLong', 'Has Missing': True},
              'Class': {'type': 'Custom', 'comparator': class_comparator},
              'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
              'patent_ct': {'type': 'Custom', 'comparator': integer_diff},
              }
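# Note: integer_diff is referenced for 'patent_ct' above but is not defined in
# this excerpt. A minimal sketch of what such a numeric comparator could look
# like (an assumption, not the original implementation):
def integer_diff(count_1, count_2):
    """Absolute difference between two integer-valued fields."""
    try:
        return abs(int(count_1) - int(count_2))
    except (TypeError, ValueError):
        # Fall back to 0 when a count is missing or malformed.
        return 0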
#input_file.set_index(cluster_name)
consolidated_input = patent_util.consolidate(input_df,
                                             cluster_name,
                                             cluster_agg_dict)
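# Assumption (patent_util is project-specific and not shown in this excerpt):
# consolidate() presumably groups input_df by cluster_name and aggregates each
# column according to cluster_agg_dict, roughly:
#
#   consolidated_input = input_df.groupby(cluster_name).agg(cluster_agg_dict)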
def __init__(self, definition):
    super(TextType, self).__init__(definition)
    if 'corpus' not in definition:
        definition['corpus'] = []
    self.comparator = dedupe.distance.CosineTextSimilarity(definition['corpus'])
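# Aside (illustration, not the library's implementation): a comparator like
# CosineTextSimilarity scores two strings by the cosine of the angle between
# their term vectors; the 'corpus' argument suggests the library also weights
# terms (e.g. by document frequency). An unweighted plain-Python sketch:
import math
from collections import Counter

def cosine_text_similarity(text_1, text_2):
    """Cosine similarity between whitespace-tokenized strings."""
    a, b = Counter(text_1.split()), Counter(text_2.split())
    dot = sum(a[t] * b[t] for t in a)
    norm = (math.sqrt(sum(v * v for v in a.values())) *
            math.sqrt(sum(v * v for v in b.values())))
    return dot / norm if norm else 0.0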