Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
for row in data.index:
data[row] = data[row].lower()
out = pd.DataFrame(columns=["Dup_ID1", "Dup_ID2", "Dup_1", "Dup_2"])
if metric == "DL": # Damerau Levenshtein Distance
res = {_d: [] for _d in data}
for _d in res.keys():
for row in data.index:
if _d != data[row] \
and jf.damerau_levenshtein_distance(_d, data[row]) < \
((len(_d) + len(data[row])/2)*threshold):
res[_d].append(data[row])
out.loc[len(out)] = (
_d.split("*")[-1], row, _d, data[row])
elif metric == "LM": # Levenshtein Distance
res = {_d: [] for _d in data}
for _d in res.keys():
for row in data.index:
if _d != data[row] \
def match_name_score(self, query):
    """Return the smallest name-distance between ``query`` and this object.

    The candidate names are ``self.table``, ``self.column`` and every
    dot-separated piece of ``self.column``. For each candidate the
    Damerau-Levenshtein distance to ``query`` is computed, and halved when
    ``query`` occurs inside the candidate as a substring. The minimum of
    those values is returned, starting from an upper bound of ``10e10``.
    """
    candidates = [self.table, self.column]
    candidates.extend(self.column.split("."))

    def penalty(name):
        # Distance first, then the substring check, as in the plain loop form.
        raw = jellyfish.damerau_levenshtein_distance(name, query)
        return raw / 2 if query in name else raw

    # The seed goes first so that ties resolve exactly as a running
    # ``min(score, distance)`` fold would: the earliest minimum wins.
    return min([10e10] + [penalty(name) for name in candidates])
Available algorithms:
* levenshtein
* damerau-levenshtein (DEFAULT)
* hamming
* jaro
* jaro-winkler
Return values:
"levenshtein", "damerau-levenshtein" and "hamming" return integers
"jaro" and "jaro-winkler" return floats in the range of 0.0 (completely
different) to 1.0 (identical strings).
'''
algos = {
"levenshtein":levenshtein_distance,
"damerau-levenshtein":damerau_levenshtein_distance,
"hamming":hamming_distance,
"jaro":jaro_similarity,
"jaro-winkler":jaro_winkler_similarity
}
if not method in list(algos.keys()):
raise ValueError("Unsupported algorithm type: %s" % method)
if str1 is None or str2 is None or not isinstance(str1, str) or not isinstance(str2, str):
raise TypeError("Arguments must be strings.")
distance_function = algos[method]
# All the jellyfish distance functions expect unicode, which is the default
# for Python3. If we're running in Python2, we need to convert them.
python_version = sys.version_info
def are_close_enough(this, that):
    """Return True when the Damerau-Levenshtein distance of the two is at most 2."""
    edit_distance = jellyfish.damerau_levenshtein_distance(this, that)
    return edit_distance <= 2
def similarity(self, first, second):
    """Return the similarity of two strings as an integer percentage.

    The Damerau-Levenshtein distance is divided by the length of the longer
    string (at least 1, so two empty strings do not divide by zero) and
    turned into ``100 * (1 - ratio)``, truncated to an int.

    If a ``MemoryError`` is raised while computing, 50 is returned.
    """
    try:
        try:
            edits = damerau_levenshtein_distance(first, second)
        except ValueError:
            # Fallback implementation; the primary one raises ValueError on
            # Python 2 (jellyfish < 0.7.2).
            edits = py_damerau_levenshtein_distance(first, second)
        longest = max(len(first), len(second), 1)
        ratio = float(edits) / longest
        return int(100 * (1.0 - ratio))
    except MemoryError:
        # Strings too long to compare: report them as not very similar.
        return 50
def damerau_levenshtein_apply(x):
    """Return a normalised Damerau-Levenshtein similarity for ``x[0]``, ``x[1]``.

    The result is ``1 - distance / max(len(x[0]), len(x[1]))``.

    If the computation raises and either element is null according to
    ``pandas.isnull``, ``np.nan`` is returned; otherwise the exception is
    re-raised.
    """
    try:
        first, second = x[0], x[1]
        edits = jellyfish.damerau_levenshtein_distance(first, second)
        longest = np.max([len(first), len(second)])
        return 1 - edits / longest
    except Exception as err:
        # Only failures caused by a missing value are turned into NaN.
        if not (pandas.isnull(x[0]) or pandas.isnull(x[1])):
            raise err
        return np.nan