import os
import csv
import re
import collections
import logging
import optparse
import time
import sys
import math
import datetime

import pandas as pd
import numpy as np

sys.path.append('/home/markhuberty/Documents/dedupe/examples/patent_example')

## Load up local libraries
import patent_util
import AsciiDammit

# Finally load dedupe
import dedupe
from dedupe.distance import cosine
# Register the cosine module under its bare name, presumably so pickled
# objects (e.g. a saved settings file) that refer to it as 'cosine' can load.
sys.modules['cosine'] = cosine

def idf(i, j):
    """Crude IDF-style comparator: rarer (lower-count) values score higher."""
    i = int(i)
    j = int(j)
    max_i = max([i, j])
    return math.log(len(data_d) / int(max_i))


def integer_diff(a, b):
    """Similarity in (0, 1] that decays with the absolute difference of a and b."""
    r = 1.0 / (abs(a - b) + 1)
    return r

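# Illustrative values for the two custom comparators above (a sketch only;
# the idf figure assumes a hypothetical data_d with 1,000 records):
#
#   integer_diff(5, 5)   -> 1.0            # identical counts
#   integer_diff(5, 9)   -> 0.2            # 1.0 / (|5 - 9| + 1)
#   idf(10, 100)         -> log(1000/100)  # ~2.30 when len(data_d) == 1000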
# ## Logging
# Dedupe uses Python logging to show or suppress verbose output. Added
# for convenience. To enable verbose logging, run this script with `-v`
# (repeat the flag for more detail).
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()

log_level = logging.WARNING
if opts.verbose == 1:
    log_level = logging.INFO
elif opts.verbose >= 2:
    log_level = logging.DEBUG
# Apply the chosen level to the root logger
logging.getLogger().setLevel(log_level)

# Reset the index so that it is sequential. Then
# store the new:old map
consolidated_input.reset_index(inplace=True)
index_map = consolidated_input['index'].to_dict()
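# Illustrative sketch of the new:old map built above (toy values, not the
# real patent input):
#
#   df = pd.DataFrame({'Name': ['a', 'b', 'c']}, index=[10, 42, 7])
#   df.reset_index(inplace=True)
#   df['index'].to_dict()   # -> {0: 10, 1: 42, 2: 7}, new position -> old index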
data_d = patent_util.readDataFrame(consolidated_input)
del consolidated_input
input_df.set_index(cluster_name, inplace=True)
## Build the comparators
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)
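# The two comparators above score how similar a pair of records' class codes
# or coauthor lists are. As a rough, from-scratch sketch of the underlying
# idea (unweighted token cosine similarity; dedupe's CosineSimilarity is
# built from the corpus passed in above and may weight tokens, e.g. by
# TF-IDF, so treat this only as an illustration, not its implementation):
def _token_cosine_sketch(tokens_a, tokens_b):
    """Cosine similarity between two bags of tokens, ignoring weights."""
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not set_a or not set_b:
        return 0.0
    overlap = len(set_a & set_b)
    return overlap / math.sqrt(len(set_a) * len(set_b))
# e.g. _token_cosine_sketch(['h01l', 'g06f'], ['h01l']) -> 1/sqrt(2) ~= 0.71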
# ## Training
if os.path.exists(r_settings_file):
    print 'reading from', r_settings_file
    deduper = dedupe.Dedupe(r_settings_file)
else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, np.round(3 * input_df.shape[0], -1))

    # Define the fields dedupe will pay attention to
    fields = {'Name': {'type': 'String', 'Has Missing': True},
              'LatLong': {'type': 'LatLong', 'Has Missing': True},
              'Class': {'type': 'Custom', 'comparator': class_comparator},
              'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}  # ,
              # 'Class_Count': {'type': 'Custom', 'comparator': idf},
              }
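# For reference, each value in data_d is expected to carry the four fields
# named above. A hypothetical record (the value formats are assumptions
# inferred from the field types and comparators, not taken from the real
# input):
#
#   data_d[0] == {'Name': 'jane doe',
#                 'LatLong': (52.52, 13.40),
#                 'Class': ('h01l', 'g06f'),
#                 'Coauthor': ('smith', 'mueller')}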