import jellyfish

def jaro_dist(scan_res, desired):
    # Compare two files as whole strings using Jaro-Winkler similarity.
    scan_line = get_file_as_string(scan_res)
    desired_line = get_file_as_string(desired)
    return jellyfish.jaro_winkler(scan_line, desired_line, long_tolerance=True)
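For context, a minimal sketch of driving the function above; `get_file_as_string` is assumed to be a simple read-file-into-string helper (it is not shown in the snippet):

def get_file_as_string(path):
    # Hypothetical helper: read the whole file into one string.
    with open(path, encoding='utf-8') as fh:
        return fh.read()

# jaro_winkler returns a similarity in [0.0, 1.0]; long_tolerance enables
# an extra adjustment for long strings.
print(jellyfish.jaro_winkler('martha', 'marhta'))  # ~0.961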
# Verify that the category has two elements; if not, use '_' for the
# table name. Needed because fuzzy_in_set goes through the same method.
table_name = '_'
category = None
if isinstance(cat, tuple):
    table_name = cat[0]
    category = cat[1]
else:
    category = cat
scores.append(
    (
        table_name,
        category,
        jellyfish.jaro_winkler(
            str(s.encode('ascii', 'replace').lower()),
            str(category.encode('ascii', 'replace').lower())
        )
    )
)
# Sort alphabetically first so that ties are broken deterministically,
# then re-sort by score with the comparator (Python's sort is stable).
scores.sort()
scores = sorted(scores, key=cmp_to_key(sort_scores))
# take the top n matches
scores = scores[:top_n]
# convert the similarities to whole-number percentages
scores = [(score[0], score[1], int(score[2] * 100)) for score in scores]
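The two-pass sort above leans on Python's stable sort: the plain `scores.sort()` fixes the order of ties before the comparator runs. A sketch with a hypothetical `sort_scores` comparator (the real one is not shown in the snippet) that ranks higher similarities first:

from functools import cmp_to_key

def sort_scores(a, b):
    # Hypothetical comparator: higher similarity (element 2) comes first.
    return (b[2] > a[2]) - (b[2] < a[2])

scores = [('t1', 'cat a', 0.71), ('t1', 'cat b', 0.93)]
scores.sort()
scores = sorted(scores, key=cmp_to_key(sort_scores))
# scores is now [('t1', 'cat b', 0.93), ('t1', 'cat a', 0.71)]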
def jaro(self, cand1, cand2):
    return jellyfish.jaro_winkler(cand1, cand2)
def jaro_winkler_similarity(s, t):
    """Jaro-Winkler similarity."""
    jw_sim = jellyfish.jaro_winkler(s, t)
    return jw_sim
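Newer jellyfish releases renamed `jaro_winkler` to `jaro_winkler_similarity`, so thin wrappers like the one above are a natural place to absorb the difference; a sketch that works against either API:

import jellyfish

try:
    _jw = jellyfish.jaro_winkler_similarity  # jellyfish >= 0.8
except AttributeError:
    _jw = jellyfish.jaro_winkler             # older releases

print(_jw('dixon', 'dicksonx'))  # ~0.813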
await self.synchronise()
s = normalise(search_terms)
tokens_to_defns = bucketise(
    (
        (normalise(i.name), (i.source, i.id))
        for i in self.catalogue.__root__
        if self.config.game_flavour in i.compatibility
    ),
    key=lambda v: v[0],
)
# TODO: weigh matches under threshold against download count
matches = heapq.nlargest(
    limit,
    ((jaro_winkler(s, n), n) for n in tokens_to_defns.keys()),
    key=lambda v: v[0],
)
defns = [Defn(*d) for _, m in matches for _, d in tokens_to_defns[m]]
results = await self.resolve(defns)
pkgs_by_defn = {d.with_name(r.slug): r for d, r in results.items() if is_pkg(r)}
return pkgs_by_defn
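A stripped-down sketch of the ranking step above, with plain strings standing in for the catalogue entries (the names here are purely illustrative):

import heapq
from jellyfish import jaro_winkler

names = ['deadly boss mods', 'big wigs', 'details damage meter']
query = 'deadly boss'
# nlargest keyed on the similarity keeps only the best-scoring candidates.
matches = heapq.nlargest(2, ((jaro_winkler(query, n), n) for n in names), key=lambda v: v[0])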
for channel in element_tree.getroot().findall('channel'):
    found_match = False
    match_scores = []
    name__text = channel.find('display-name').text.encode(sys.stdout.encoding)
    icon_url = channel.find('icon').attrib['src']
    stripped_name = name__text.replace(" ", "")
    for hdhr_name in hdhr_names:
        score = 0
        safe_hdhr_name = hdhr_name if is_ascii(hdhr_name) else hdhr_name.decode(sys.stdout.encoding)
        stripped_hdhr_name = safe_hdhr_name.replace(" ", "")
        try:
            score = jellyfish.jaro_winkler(stripped_name, stripped_hdhr_name)
        except UnicodeEncodeError:
            try:
                safe_name_text = name__text if is_ascii(name__text) else name__text.decode(sys.stdout.encoding)
                logger.warn(u"Unable to do score for '{0}' vs '{1}'".format(safe_name_text, safe_hdhr_name))
            except UnicodeEncodeError:
                ## If we cannot even log the error because of the encoding, log a new, "safe" error instead
                safe_hdhr_name = to_utf8(safe_hdhr_name)
                safe_name_text = to_utf8(safe_name_text)
                logger.warn(u"Unable to do score calculation for {0} - {1} - console encoding: {2}".format(safe_name_text, safe_hdhr_name, sys.stdout.encoding))
        match_scores.append(score)
    maxValue = max(match_scores)
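The nested try/except above exists only because byte strings and unicode are being mixed; a sketch of the same comparison with both names normalised to text up front, which sidesteps the encoding errors entirely on Python 3 (`channel_score` is a made-up name):

import unicodedata
import jellyfish

def channel_score(display_name, hdhr_name):
    # Compare NFC-normalised text with spaces stripped; no encode/decode juggling.
    a = unicodedata.normalize('NFC', display_name).replace(' ', '')
    b = unicodedata.normalize('NFC', hdhr_name).replace(' ', '')
    return jellyfish.jaro_winkler(a, b)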
def _use_similarity(x, y):
    if len(x) <= 1 or len(y) <= 1:
        return -1.
    # jaro_winkler crashes if slashes are provided.
    return jellyfish.jaro_winkler(x, y)
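If the inputs cannot be guaranteed clean, a defensive wrapper in the same spirit as `_use_similarity` (a generic pattern, not taken from this code base) keeps one bad pair from aborting a whole batch:

import jellyfish

def safe_jaro_winkler(x, y, default=-1.0):
    # Fall back to `default` for degenerate inputs or library errors.
    if len(x) <= 1 or len(y) <= 1:
        return default
    try:
        return jellyfish.jaro_winkler(x, y)
    except (TypeError, ValueError):
        return default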
f_not_found = open("not_found.txt", "w")
f_negative = open("negative.txt", "w")
f_positive = open("positive.txt", "w")
tuples_not_found = set()
for t in total_uniq:
    # try a direct match
    per_extracted = t[0].decode("utf8").upper().strip()
    org_truth = ground_truth.get(per_extracted)
    found = False
    if org_truth:
        # if there is a direct match, look for similar organisations
        for org in org_truth:
            score = jellyfish.jaro_winkler(org.encode("utf8"), t[1].upper())
            if score >= 0.8:
                f_positive.write(t[0] + '\t' + t[1] + '\n')
                positive += 1
                found = True
                break
        if not found:
            negative += 1
            f_negative.write(t[0] + '\t' + t[1] + '\t\t:' + ';'.join(org_truth).encode("utf8") + '\n')
    else:
        tuples_not_found.add(t)
        not_found += 1

for t in tuples_not_found:
    f_not_found.write(t[0] + '\t' + t[1] + '\n')
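With the `positive`, `negative` and `not_found` counters filled in by the loop above, the usual summary numbers follow directly (a sketch; the variable names mirror the script):

matched = positive + negative
total = matched + not_found
precision = float(positive) / matched if matched else 0.0
coverage = float(matched) / total if total else 0.0
print("precision: {:.2%}, coverage: {:.2%}".format(precision, coverage))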