import numpy

# Fragment of dedupe's cosine distance module; the enclosing __call__ is
# reconstructed here from context, assuming precomputed (vector, norm)
# pairs cached in self.vectors.
class CosineSimilarity(object):
    def __call__(self, field_1, field_2):
        vector_1, norm_1 = self.vectors[field_1]
        vector_2, norm_2 = self.vectors[field_2]
        if norm_1 and norm_2:
            # Dot product over the shared vocabulary, scaled by both norms
            numerator = 0.0
            for word in set(vector_1) & set(vector_2):
                numerator += vector_1[word] * vector_2[word]
            return numerator / (norm_1 * norm_2)
        else:
            return numpy.nan

    def __getstate__(self):
        # Drop the cached vectors when pickling so the corpus-sized cache
        # is not serialized along with the comparator.
        result = self.__dict__.copy()
        result['vectors'] = {}
        return result

class CosineTextSimilarity(CosineSimilarity):
    def _list(self, document):
        return document.split()  # free text: tokenize on whitespace

class CosineSetSimilarity(CosineSimilarity):
    def _list(self, document):
        return document  # document is already a collection of tokens
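
# Self-contained sketch of what these comparators compute: the cosine of two
# bag-of-words count vectors. The function name and the simple count weighting
# are illustrative; dedupe's real classes precompute weighted vectors and
# norms from a corpus up front.
import math
from collections import Counter

def cosine(tokens_1, tokens_2):
    v1, v2 = Counter(tokens_1), Counter(tokens_2)
    numerator = sum(v1[w] * v2[w] for w in set(v1) & set(v2))
    norm_1 = math.sqrt(sum(c * c for c in v1.values()))
    norm_2 = math.sqrt(sum(c * c for c in v2.values()))
    return numerator / (norm_1 * norm_2) if norm_1 and norm_2 else float('nan')

print(cosine('cat hat'.split(), 'cat mat'.split()))  # 0.5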
# Excerpt from a dedupe patent-author example script; the call that builds
# consolidated_input is truncated above, and input_df, cluster_name, and
# r_settings_file are assumed to be defined earlier in the script.
import os

import numpy as np
import dedupe
import patent_util

# Reset the index so that it is sequential, then store the new:old index map
consolidated_input.reset_index(inplace=True)
index_map = consolidated_input['index'].to_dict()

data_d = patent_util.readDataFrame(consolidated_input)
del consolidated_input  # free the consolidated frame once converted
input_df.set_index(cluster_name, inplace=True)

## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
if os.path.exists(r_settings_file):
    print('reading from', r_settings_file)
    deduper = dedupe.Dedupe(r_settings_file)
else:
    # To train dedupe, we feed it a random sample of records: roughly
    # 3x the number of input records, rounded to the nearest ten.
    data_sample = dedupe.dataSample(data_d, int(np.round(3 * input_df.shape[0], -1)))
    # Define the fields dedupe will pay attention to
    fields = {
        'Name': {'type': 'String', 'Has Missing': True},
        'LatLong': {'type': 'LatLong', 'Has Missing': True},
        'Class': {'type': 'Custom', 'comparator': class_comparator},
        'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
        # 'Class_Count': {'type': 'Custom', 'comparator': idf},
    }
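
# A minimal, standalone sketch of the reset_index / index_map pattern above,
# using a toy frame: after reset_index the rows are numbered sequentially and
# the old labels move into an 'index' column, so index_map translates the new
# sequential positions back to the original labels.
import pandas as pd

df = pd.DataFrame({'Name': ['a', 'b', 'c']}, index=[10, 42, 7])
df.reset_index(inplace=True)
index_map = df['index'].to_dict()
print(index_map)  # {0: 10, 1: 42, 2: 7}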
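
# How the 'Custom' fields plug in, as a hedged illustration: dedupe calls the
# comparator with the two field values being compared and treats the returned
# number as that field's distance. Reusing two corpus entries from above keeps
# the lookup in the comparator's precomputed vector cache valid.
example_distance = class_comparator(classes[0], classes[1])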