Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def testPartialTokenSetRatio(self):
    """The s4/s7 fixture pair is expected to score a perfect 100 on partial_token_set_ratio."""
    score = fuzz.partial_token_set_ratio(self.s4, self.s7)
    self.assertEqual(score, 100)
# NOTE(review): truncated fragment — the enclosing function's `def` line is not
# visible here; `processor`, `query`, and `scorer` are bound outside this view.
# Indentation appears to have been stripped from this paste.
# Perform a no-op, as the processor still needs to be callable downstream.
if processor is None:
processor = no_process
# Run the processor on the input query.
processed_query = processor(query)
# An empty processed query means every comparison will score 0 — warn the caller.
if len(processed_query) == 0:
logging.warning(u"Applied processor reduces input query to empty string, "
"all comparisons will have score 0. "
"[Query: \'{0}\']".format(query))
# Don't run full_process twice: these scorers already apply it internally,
# so drop the redundant external processor.
if scorer in [fuzz.WRatio, fuzz.QRatio,
fuzz.token_set_ratio, fuzz.token_sort_ratio,
fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
fuzz.UWRatio, fuzz.UQRatio] \
and processor == utils.full_process:
processor = no_process
# Only process the query once instead of for every choice:
# pre-process here, then tell the scorer to skip its own full_process pass.
# U-prefixed scorers are the Unicode variants, hence force_ascii=False for them.
if scorer in [fuzz.UWRatio, fuzz.UQRatio]:
pre_processor = partial(utils.full_process, force_ascii=False)
scorer = partial(scorer, full_process=False)
elif scorer in [fuzz.WRatio, fuzz.QRatio,
fuzz.token_set_ratio, fuzz.token_sort_ratio,
fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio]:
pre_processor = partial(utils.full_process, force_ascii=True)
scorer = partial(scorer, full_process=False)
else:
pre_processor = no_process
processed_query = pre_processor(processed_query)
# NOTE(review): truncated fragment — these first lines are the tail of a call
# whose opening is not visible, and the trailing dict literal is cut off.
# Presumably part of a loop matching CPE records against SCCM inventory rows;
# confirm against the original file.
t_cpe_relX_tmp,
t_ar_ver0
)
if (t_cpe_relX_tmp != '-') and (t_ar_ver0 != '-'):
# If release data is specified, then check that
# there is at least a partial match; otherwise skip this candidate.
if fz_rel_ratio < 90 or fz_rel_ptl_ratio < 100:
continue
# 2) There should be at least one occurrence of one word in
# the cpe full name somewhere in the sccm full name.
# force_ascii=False keeps non-ASCII product names intact.
fz_ptl_tok_set_ratio = fz.partial_token_set_ratio(
t_cpe_titleX_tmp,
t_ar_dsply0_tmp,
force_ascii=False
)
# Threshold 70 — require a reasonably strong token overlap before keeping.
if fz_ptl_tok_set_ratio < 70:
continue
######
# calculate fuzzy matching statistics for this match
######
lst_dict.append({
'vendor_X': t_cpe_vdr_X,
'software_X': t_cpe_sft_X,
'Version0': t_ar_ver0,
# Fuzzy-match features: score each question pair with both "partial token"
# scorers and store the results as new DataFrame columns.
def _pair_score(scorer, row):
    """Apply *scorer* to the stringified question pair in *row*."""
    return scorer(str(row['question1']), str(row['question2']))

data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda row: _pair_score(fuzz.partial_token_set_ratio, row), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda row: _pair_score(fuzz.partial_token_sort_ratio, row), axis=1)
# NOTE(review): truncated fragment — the opening of the tuple-unpacking `for`
# header is not visible, and the trailing `fz.ratio(` call is cut off.
# Presumably part of a loop matching CPE vendor names against WMI Publisher0
# strings; confirm against the original file.
t_arPub0
) in df_arPub.itertuples():
# quick heuristics:
# a) 1st word of cpe Vendor string has to be in the
# tokenized wmi Publisher0 string somewhere
# b) condensed cpe name has to be shorter than the full
# WMI 'Publisher0' name
if len(t_cpeVen) > len(t_arPub0):
# self.logger.debug('arPub0 too short - continuing'
continue
# Look for at least one occurrence of one word in cpeVen
# somewhere in arPub; score 100 means full token-subset containment.
# force_ascii=False keeps non-ASCII vendor names intact.
if fz.partial_token_set_ratio(
t_cpeVen,
t_arPub0,
force_ascii=False
) < 100:
continue
# Calculate fuzzy matching statistics as "features" for
# the subsequent ML classification.
lst_dict.append({
'publisher0': t_arPub0_orig,
'pub0_cln': t_arPub0,
'vendor_X': t_cpeVen_orig,
'ven_cln': t_cpeVen,
'fz_ratio': fz.ratio(
t_cpeVen,
def fuzzy(s1, s2):
    """Return a list of eight fuzzy-match scores for *s1* vs *s2*.

    Each fuzzywuzzy scorer returns an int in [0, 100]; the scores are
    rescaled to floats in [0, 1] and returned in a fixed order suitable
    for use as an ML feature vector.
    """
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.partial_token_sort_ratio,
        fuzz.token_set_ratio,
        fuzz.partial_token_set_ratio,
        fuzz.QRatio,
        fuzz.WRatio,
    )
    return [score(s1, s2) / 100 for score in scorers]