Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def scorers_processors():
    """
    Build every (scorer, processor) combination used for testing.

    :return: [(scorer, processor), ...]
    """
    base_scorers = (fuzz.ratio, fuzz.partial_ratio)
    base_processors = (
        lambda s: s,
        partial(utils.full_process, force_ascii=False),
        partial(utils.full_process, force_ascii=True),
    )
    # Cartesian product of the basic scorers with every processor variant.
    pairs = [(scorer, processor)
             for scorer in base_scorers
             for processor in base_processors]

    # The composite scorers are each paired with the full_process variant
    # matching their ASCII/Unicode handling.
    pairs += [
        (fuzz.WRatio, partial(utils.full_process, force_ascii=True)),
        (fuzz.QRatio, partial(utils.full_process, force_ascii=True)),
        (fuzz.UWRatio, partial(utils.full_process, force_ascii=False)),
        (fuzz.UQRatio, partial(utils.full_process, force_ascii=False)),
        (fuzz.token_set_ratio, partial(utils.full_process, force_ascii=True)),
        (fuzz.token_sort_ratio, partial(utils.full_process, force_ascii=True)),
        (fuzz.partial_token_set_ratio, partial(utils.full_process, force_ascii=True)),
        (fuzz.partial_token_sort_ratio, partial(utils.full_process, force_ascii=True)),
    ]
    return pairs
def find_closest_match(text_rows, word, default_compare=True, filter_words_with_quotation_marks=True):
    """
    Find the row whose text contains the closest fuzzy match to *word*.

    :param text_rows: iterable of (row_id, text) pairs to search
    :param word: headword to look for; double quotes are stripped from it
        before matching
    :param default_compare: score with fuzz.UWRatio when True, fuzz.UQRatio
        when False
    :param filter_words_with_quotation_marks: when True, only candidate
        words/ngrams that contain a double quote are considered
        (NOTE(review): the name suggests the opposite — confirm intent)
    :return: (row_id, row_text, matched_word, score) for the best match,
        or None when no candidates were found
    """
    # TODO: if the headword is more than one word, must look through ngrams
    # instead of just split single words.
    results = []
    headword_size = len(word.split())
    scorer = fuzz.UWRatio if default_compare else fuzz.UQRatio
    for row in text_rows:
        text_words = row[1].split()
        if headword_size > 1:
            # Multi-word headword: compare against same-length ngrams.
            text_words = create_ngrams(text_words, headword_size)
        if filter_words_with_quotation_marks:
            text_words = [w for w in text_words if '"' in w]
        if text_words:
            matched_word, score = process.extractOne(
                word.replace('"', ''), text_words,
                processor=laaz_process, scorer=scorer)
            results.append((row[0], row[1], matched_word, score))
    sresults = sorted(results, key=lambda x: x[-1], reverse=True)
    top_res = sresults[0] if sresults else None
    # If the matched word and the headword differ too much in length (2x or
    # more), the default scorer is unreliable; retry once with UQRatio.
    if (top_res and len(word) and default_compare
            and float(max(len(top_res[2]), len(word))) / min(len(top_res[2]), len(word)) >= 2):
        # Bug fix: propagate filter_words_with_quotation_marks into the
        # retry — the recursive call previously reset it to True even when
        # the caller had passed False.
        top_res = find_closest_match(
            text_rows, word, default_compare=False,
            filter_words_with_quotation_marks=filter_words_with_quotation_marks)
    return top_res
'ven_cln': t_cpeVen,
'fz_ratio': fz.ratio(
t_cpeVen,
t_arPub0),
'fz_ptl_ratio': fz.partial_ratio(
t_cpeVen,
t_arPub0),
'fz_tok_set_ratio': fz.token_set_ratio(
t_cpeVen,
t_arPub0,
force_ascii=False),
'fz_ptl_tok_sort_ratio': fz.partial_token_sort_ratio(
t_cpeVen,
t_arPub0,
force_ascii=False),
'fz_uwratio': fz.UWRatio(
t_cpeVen,
t_arPub0)
})
mycount = mycount + 1
if mycount % 1000 == 0:
self.logger.debug(
'# entries produced: {0}\n'.format(
mycount
)
)
# # debug code to shorten loop for testing
# if mycount > 1000:
# break
# # debug code to speed thru loops
def scorer(left, right):
    """Fuzzy-score *left* against *right*, forcing 0 for meta pages."""
    # Titles like "list of ...", "... topics", or "wikiproject ..." are
    # navigation/meta pages and must never count as matches.
    is_meta_page = (right.startswith(("list of", "wikiproject"))
                    or right.endswith(" topics"))
    return 0 if is_meta_page else UWRatio(left, right)
'fz_ratio': fz.ratio(
t_cpe_titleX_tmp,
t_ar_dsply0_tmp
),
'fz_ptl_ratio': fz.partial_ratio(
t_cpe_titleX_tmp,
t_ar_dsply0_tmp
),
'fz_tok_set_ratio': fz_ptl_tok_set_ratio,
'fz_ptl_tok_sort_ratio': fz.token_sort_ratio(
t_cpe_titleX_tmp,
t_ar_dsply0_tmp,
force_ascii=False
),
'fz_uwratio': fz.UWRatio(
t_cpe_titleX_tmp,
t_ar_dsply0_tmp
),
'fz_rel_ratio': fz_rel_ratio,
'fz_rel_ptl_ratio': fz_rel_ptl_ratio,
't_cve_name': t_cve_name
})
m = m+1
n = n+1
if n % 100 < 1:
self.logger.debug(
'---Working ar: '
'sccm sft i/p: {0} '
', potential matches output: {1}\n'.format(n, m)