def test_service_metadata(self):
    self.maxDiff = None
    response = self.client.get('/api/1.0/refine/reconcile', {'callback': 'jsonp123'})
    self.assertEqual(200, response.status_code)
    self.assertEqual(100,
                     fuzz.token_sort_ratio(
                         'jsonp123({"name": "Influence Explorer Reconciliation3", "identifierSpace": "http://staging.influenceexplorer.com/ns/entities", "schemaspace": "http://staging.influenceexplorer.com/ns/entity.object.id", "view": { "url": "http://staging.influenceexplorer.com/entity/{{id}}" }, "preview": { "url": "http://staging.influenceexplorer.com/entity/{{id}}", "width": 430, "height": 300 }, "defaultTypes": []})',
                         response.content
                     ))
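
# Why the assertion above tolerates reordering: token_sort_ratio lower-cases,
# strips punctuation and sorts the tokens of both strings before comparing, so
# two payloads that differ only in token/key order still score a perfect 100.
# A minimal, self-contained sketch:
from fuzzywuzzy import fuzz

assert fuzz.token_sort_ratio("new york mets vs atlanta braves",
                             "atlanta braves vs new york mets") == 100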
# Iterate the Aho-Corasick automaton over the lower-cased text. Each hit yields
# the end index of the match plus that entry's payload; the exact field order
# unpacked here is assumed from how the names are used in the loop body.
for end_index, (match, category_list, reference_db_list, entity_id_list, original_value,
                pref_name) in automation.iter(text_to_tag.lower()):
    start_index = end_index - len(match) + 1
    end_index += 1
    if (start_index == 0 or text_to_tag[start_index - 1] in BioEntityTagger.separators_all) and \
            (end_index == len(text_to_tag) or text_to_tag[end_index] in BioEntityTagger.separators_all):
        for j in range(len(category_list)):
            category = category_list[j]
            reference_db = reference_db_list[j]
            entity_id = entity_id_list[j]
            if isinstance(entity_id, list):
                entity_id = entity_id[0]
            if category.endswith('-TOKEN'):
                pre, post = original_value.split(match)[:2]
                potential_match = text_to_tag[start_index:end_index + len(post)]
                score = fuzz.token_sort_ratio(original_value, potential_match)
                if score > 90:
                    tag = MatchedTag(match, start_index, end_index, category.replace('-TOKEN', ''),
                                     reference_db, entity_id, original_value, pref_name)
                    matches.append(tag.__dict__)
            else:
                tag = MatchedTag(match, start_index, end_index, category, reference_db, entity_id,
                                 original_value, pref_name)
                matches.append(tag.__dict__)
    else:
        pass

grouped_matches = BioEntityTagger.group_matches_by_category_and_reference(matches)
filtered_matches = []
for group, matches_in_group in grouped_matches.items():
    non_nested_matches = BioEntityTagger.remove_nested_matches(matches_in_group)
# Tail of the URL-building try block: the else branch assembles the AAT query
# URL, then the XML response is fetched and parsed.
    else:
        url = api_base_url + urllib.quote(query) + '&logop=and&notes='
    app.logger.debug("AAT url is " + url)
    resp = requests.get(url)
    results = ET.fromstring(resp.content)
except getopt.GetoptError as e:
    app.logger.warning(e)
    return out
for child in results.iter('Preferred_Parent'):
    match = False
    try:
        name = re.sub(r'\[.+?\]', '', child.text.split(',')[0]).strip()
        # the termid is NOT the ID! We have to find it in the first preferred parent
        id = re.search(r"\[(.+?)\]", child.text.split(',')[0]).group(1)
        score = fuzz.token_sort_ratio(query, name)
    except AttributeError:
        # Skip children whose text cannot be parsed, so `name` and `score`
        # below are always bound.
        continue
    if score > 95:
        match = True
    app.logger.debug("Label is " + name + " Score is " +
                     str(score) + " URI is " + id)
    resource = {
        "id": id,
        "name": name,
        "score": score,
        "match": match,
        "type": query_type_meta
    }
    out.append(resource)
# Sort this list containing prefterms by score
def pair_aligned_propositions(propositions, pronouns):
    """
    Align predicates with the same arguments in different sentences
    :param propositions: the (sent, pred, arg1, arg2) tuples
    :return: a list of aligned_prop
    """
    predicate_alignments = []
    candidates = get_candidate_pairs(propositions, pronouns)
    for (tweet_id1, sent1, sf_pred1, pred1, s0_a0, s0_a1,
         tweet_id2, sent2, sf_pred2, pred2, s1_a0, s1_a1) in candidates:
        # Same tweet
        if fuzz.token_sort_ratio(sent1, sent2) >= 95:
            continue
        # Same predicates?
        if is_eq_preds(pred1, pred2):
            continue
        # Same arguments?
        is_eq_a0_a0, is_eq_a1_a1, is_eq_a0_a1, is_eq_a1_a0 = \
            is_eq_arg(s0_a0, s1_a0), is_eq_arg(s0_a1, s1_a1), is_eq_arg(s0_a0, s1_a1), is_eq_arg(s0_a1, s1_a0)
        # Are arguments aligned?
        is_aligned_a0_a0 = is_eq_a0_a0 or is_aligned_arg(s0_a0, s1_a0)
        is_aligned_a1_a1 = is_eq_a1_a1 or is_aligned_arg(s0_a1, s1_a1)
        is_aligned_a0_a1 = is_eq_a0_a1 or is_aligned_arg(s0_a0, s1_a1)
        is_aligned_a1_a0 = is_eq_a1_a0 or is_aligned_arg(s0_a1, s1_a0)
def preevaluate_filenames(thresholds, right_files, left_file):
    # We won't enter preevaluate_filenames if tf >= 1.0
    candidates = set()
    for right_file in right_files:
        sim = fuzz.token_sort_ratio(left_file, right_file) / 100
        if sim < thresholds.filename:
            continue
        candidates.add(right_file)
    return left_file, candidates
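
# Rough usage sketch for the pre-filter above. The `thresholds` object here is
# a stand-in exposing only the .filename attribute the function actually reads;
# the real type in the surrounding project is not shown in this excerpt.
from types import SimpleNamespace

thresholds = SimpleNamespace(filename=0.8)
left, candidates = preevaluate_filenames(thresholds,
                                         ['drivers/net/eth.c', 'fs/ext4/inode.c'],
                                         'drivers/net/ethernet.c')
# `candidates` keeps only the right-hand files whose token_sort_ratio with the
# left-hand file is at least the 0.8 threshold.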
# relation correction
C_pruned = []
for mid in set(C):
    if mid in index_reach.keys():  # PROBLEM: don't know why this may not exist??
        count_mid = C.count(mid)  # count number of times mid appeared in C
        C_pruned.append((mid, count_mid))

C_tfidf_pruned = []
for mid, count_mid in C_pruned:
    if mid in index_names.keys():
        cand_ent_name = pick_best_name(question[2], index_names[mid])
        if args.sim == "custom":
            tfidf = calc_tf_idf(query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
            simple_match = fuzz.ratio(cand_ent_name, question) / 100.0
            token_sort_ratio = fuzz.token_sort_ratio(cand_ent_name, question) / 100.0
            score = tfidf * 0.01 + simple_match + token_sort_ratio
        elif args.sim == "fuzzy":
            score = fuzzy_match_score(cand_ent_name, query_text)
        else:
            score = calc_tf_idf(query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
        C_tfidf_pruned.append((mid, cand_ent_name, score))
# print("C_tfidf_pruned[:10]: {}".format(C_tfidf_pruned[:10]))
if len(C_tfidf_pruned) == 0:
    # print("WARNING: C_tfidf_pruned is empty.")
    notfound_c_lineids.append(lineid)
    notfound_c += 1
    continue
C_tfidf_pruned.sort(key=lambda t: -t[2])
cand_mids = C_tfidf_pruned[:HITS_TOP_ENTITIES]
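
# Standalone sketch of the "custom" blend above: a down-weighted tf-idf term
# plus two fuzzy-string signals, each scaled to [0, 1]. The tfidf argument here
# is a made-up constant standing in for calc_tf_idf's output, and the names are
# placeholders rather than the project's API.
from fuzzywuzzy import fuzz

def blended_score(question_text, cand_ent_name, tfidf):
    simple_match = fuzz.ratio(cand_ent_name, question_text) / 100.0
    token_sort = fuzz.token_sort_ratio(cand_ent_name, question_text) / 100.0
    return tfidf * 0.01 + simple_match + token_sort

print(blended_score("who recorded the album thriller", "michael jackson", tfidf=3.2))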
def compare_hunks(left, right):
    # This case happens, for example, if both hunks remove empty newlines.
    # This check is _required_ as fuzzywuzzy currently contains a bug that
    # does misevaluations in case of equivalence. See
    # https://github.com/seatgeek/fuzzywuzzy/issues/196
    if left == right:
        return 100
    return fuzz.token_sort_ratio(left, right)
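
# Quick check of the guard above: identical hunks short-circuit to 100 without
# calling into fuzzywuzzy, sidestepping the issue linked in the comment, while
# differing hunks fall through to token_sort_ratio.
hunk = "-\n-\n"
assert compare_hunks(hunk, hunk) == 100
assert compare_hunks("- old line", "+ new line") < 100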
if len(alternate) > 0:
    alt = alternate[0]
else:
    alt = ''
geonames_id = item.get('geonameId')
geonames_uri = make_uri(geonames_id)
lat = item.get('lat')
lng = item.get('lng')
# Way to cheat + get name + coordinates into results:
name_coords = name + ' | ' + lat + ', ' + lng
# Avoid returning duplicates:
if geonames_id in unique_geonames_ids:
    continue
else:
    unique_geonames_ids.append(geonames_id)
score_1 = fuzz.token_sort_ratio(query, name)
score_2 = fuzz.token_sort_ratio(query, alt)
score = max(score_1, score_2)
if query == text.normalize(name, PY3):
    match = True
elif query == text.normalize(alt, PY3):
    match = True
resource = {
    "id": geonames_uri,
    "name": name_coords,
    "score": score,
    "match": match,
    "type": query_type_meta
}
out.append(resource)
# Sort this list by score
sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
# Refine only will handle top three matches.
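
# In isolation, the two scoring calls above compare the query against both the
# primary name and the first alternate name and keep whichever is higher:
from fuzzywuzzy import fuzz

query = "wien"
name, alt = "Vienna", "Wien"
score = max(fuzz.token_sort_ratio(query, name), fuzz.token_sort_ratio(query, alt))
print(score)  # 100, because the alternate name matches exactly (case-insensitively)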
def fuzzy_for_domains(utterance, context_ls):
    ratio_ls = []
    for i in range(len(context_ls)):
        ratio_sum = 0
        ratio_sum += fuzz.ratio(utterance, context_ls[i][0] + context_ls[i][1])
        ratio_sum += fuzz.partial_ratio(utterance, context_ls[i][0] + context_ls[i][1])
        ratio_sum += fuzz.token_sort_ratio(utterance, context_ls[i][0] + context_ls[i][1])
        ratio_sum += fuzz.token_set_ratio(utterance, context_ls[i][0] + context_ls[i][1])
        mean_ratio = ratio_sum / 4
        ratio_ls.append(mean_ratio)
    return normalization(ratio_ls)
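
# What a single iteration above computes, in isolation: the mean of four
# fuzzywuzzy scorers for the utterance against one concatenated context entry.
# The sample strings are illustrative, and normalization() is not shown in the
# snippet, so it is omitted here.
from fuzzywuzzy import fuzz

utterance = "book a table for two tonight"
context_entry = ("restaurant booking", " reserve a table")
joined = context_entry[0] + context_entry[1]
mean_ratio = (fuzz.ratio(utterance, joined)
              + fuzz.partial_ratio(utterance, joined)
              + fuzz.token_sort_ratio(utterance, joined)
              + fuzz.token_set_ratio(utterance, joined)) / 4
print(mean_ratio)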