Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Build the candidate's comparison string as "<last> <first>", with the first
# name passed through unnickname().
name2 = simple_clean(name2_name.last) + " " + unnickname(name2_name.first)

# Calculate a bunch of similarity metrics between the two name strings.
text1 = name1_standardized
text2 = name2

# Each fuzz score is scaled by 1/100.
ratio = 1 / 100.0 * fuzz.ratio(text1, text2)
partial_ratio = 1 / 100.0 * fuzz.partial_ratio(text1, text2)
token_sort_ratio = 1 / 100.0 * fuzz.token_sort_ratio(text1, text2)
token_set_ratio = 1 / 100.0 * fuzz.token_set_ratio(text1, text2)

# Mean of the two lengths. The previous expression, 1/2*len(text1)+len(text2),
# bound the 1/2 to len(text1) only, so it never produced the average.
avg_len = (len(text1) + len(text2)) / 2.0
min_len = min(len(text1), len(text2))

# Edit distance normalized by the average length; stays 0 when the distance
# cannot be computed or there is nothing to normalize by.
l_ratio = 0
try:
    l_distance = jellyfish.levenshtein_distance(text1, text2)
except UnicodeEncodeError:
    # Best effort: keep the default l_ratio when jellyfish raises this.
    pass
else:
    if avg_len:
        l_ratio = 1.0 - float(l_distance) / avg_len

# Longest common match relative to the shorter string; an empty string has
# nothing to match, so avoid dividing by zero.
long_match = longest_match(text1, text2)
lng_ratio = float(long_match) / min_len if min_len else 0.0

# Only compute a combined score when at least one metric clears 0.6.
score = 0
if ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6 or lng_ratio > 0.6:
    score = compute_scores([ratio, partial_ratio, l_ratio, lng_ratio])

if debug:
    log.debug("|fuzzymatchresult|%s|'%s'|'%s'|score=%s|ratio=%s|partial_ratio=%s|token_sort_ratio=%s|token_set_ratio=%s| l_ratio=%s|lng_ratio=%s" % (match['cand_id'], match['cand_name'], name, score, ratio, partial_ratio, token_sort_ratio, token_set_ratio, l_ratio, lng_ratio))
if (score > 0.8):
if prop['pid'] in ('state', 'chamber'):
spec[prop['pid']] = prop['v']
# Fetch the legislators matching the filter spec built so far.
legislators = db.legislators.find(spec)

# Whether the query produced exactly one candidate does not change while
# iterating, so ask for the count once instead of on every pass of the loop.
is_only_result = legislators.count() == 1

results = []
for leg in legislators:
    if is_only_result:
        # A lone candidate is treated as a definite match.
        match = True
        score = 100
    else:
        match = False
        if leg['last_name'] == query['query']:
            score = 90
        else:
            # Score falls off as the case-insensitive edit distance between
            # the full name and the query grows.
            distance = levenshtein_distance(leg['full_name'].lower(),
                                            query['query'].lower())
            score = 100.0 / (1 + distance)

    # Note: There's a bug in Refine that causes reconciliation
    # scores to be overwritten if the same legislator is returned
    # for multiple queries. see:
    # http://code.google.com/p/google-refine/issues/detail?id=185
    results.append({"id": leg['_id'],
                    "name": leg['full_name'],
                    "score": score,
                    "match": match,
                    "type": [
                        {"id": "/billy/legislator",
                         "name": "Legislator"}]})
if prop['pid'] in ('state', 'chamber'):
spec[prop['pid']] = prop['v']
# Fetch the legislators matching the filter spec built so far.
legislators = db.legislators.find(spec)

# Whether the query produced exactly one candidate does not change while
# iterating, so ask for the count once instead of on every pass of the loop.
is_only_result = legislators.count() == 1

results = []
for leg in legislators:
    if is_only_result:
        # A lone candidate is treated as a definite match.
        match = True
        score = 100
    else:
        match = False
        if leg['last_name'] == query['query']:
            score = 90
        else:
            # Score falls off as the case-insensitive edit distance between
            # the full name and the query grows.
            distance = levenshtein_distance(leg['full_name'].lower(),
                                            query['query'].lower())
            score = 100.0 / (1 + distance)

    # Note: There's a bug in Refine that causes reconciliation
    # scores to be overwritten if the same legislator is returned
    # for multiple queries. see:
    # http://code.google.com/p/google-refine/issues/detail?id=185
    results.append({"id": leg['_id'],
                    "name": leg['full_name'],
                    "score": score,
                    "match": match,
                    "type": [
                        {"id": "/billy/legislator",
                         "name": "Legislator"}]})
v_cl[self.v_chain_type[v]] = []
if self.v_chain_type[v] == "IGHV":
v_cl[self.v_chain_type[v]].append(v)
f, s = pSeq[:pos1[1]], pSeq[pos1[1] + 1:]
v_overlap = len(f) + len(s) + 1
for v1, v2 in v_cl.items():
for v3 in v2:
if v3 not in self.vi_pieces:
continue
v, vv = self.vi_pieces[v3]
minlen1 = min(len(f), len(v))
minlen2 = min(len(s), len(vv))
if minlen1 > 0:
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
else:
mismatch1 = 0
if minlen2 > 0:
mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
else:
mismatch2 = 0
if (minlen1 <= 3 and mismatch2 <= 1) or (minlen1 >= self.__settings.minlen1 and mismatch1 <= self.__settings.mismatch1 and minlen2 >= self.__settings.minlen2 and mismatch2 <= self.__settings.mismatch2):
vtypes[v3] = (minlen1 + minlen2 + 1, mismatch1 + mismatch2)
if pos2 != [-1, -1]:
if pos2[0] != -1:
if True: #pos2[0] + 3 < len(pSeq) and pSeq[pos2[0] + 3] == "G":
if pos2[0] > 10:
offset = pos2[0] - 10
else:
j_cl = {}
for j in jc:
if self.j_chain_type[j] != "IGHJ" and self.j_chain_type[j] not in j_cl:
j_cl[self.j_chain_type[j]] = []
if self.j_chain_type[j] != "IGHJ":
j_cl[self.j_chain_type[j]].append(j)
f, s = pSeq[:pos2[0]], pSeq[pos2[0] + 1:]
for j1, j2 in j_cl.items():
for j3 in j2:
if j3 not in self.jay_pieces:
continue
j, jj = self.jay_pieces[j3]
minlen1 = min(len(f), len(j))
minlen2 = min(len(s), len(jj))
if minlen2 > 0:
mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(jj[:minlen2]))
else:
mismatch2 = 0
if minlen1 > 0:
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(j[-minlen1:]))
else:
mismatch1 = 0
if (minlen2 == 0 and mismatch1 <= 1) or (minlen2 > 3 and mismatch2 <= 1 and minlen1 >= 2 and mismatch1 <= 2):
jtypes[j3] = mismatch1 + mismatch2
if pos2[1] != -1:
if pos2[1] > 10:
offset = pos2[1] - 10
else:
offset = 0
kmrs2 = self.kmers(pSeq[offset:], 3)
interJ = set(kmrs2) & jkeys
jlist = []
def levenshtein_apply(x):
    """Return a normalized Levenshtein similarity for the pair ``x``.

    The result is ``1 - distance / longest``, where ``distance`` is
    ``jellyfish.levenshtein_distance(x[0], x[1])`` and ``longest`` is the
    larger of ``len(x[0])`` and ``len(x[1])``.

    Args:
        x: An indexable pair; ``x[0]`` and ``x[1]`` are the values compared.

    Returns:
        The similarity value, or ``np.nan`` when the computation fails and
        either value is null according to ``pandas.isnull``.

    Raises:
        Exception: Whatever the computation raised, when neither value is
            null.
    """
    try:
        distance = jellyfish.levenshtein_distance(x[0], x[1])
        longest = np.max([len(x[0]), len(x[1])])
        # Convert the numerator so the quotient is always a true division,
        # even where "/" between two integers would floor the result.
        return 1 - float(distance) / longest
    except Exception:
        # Null inputs are expected to fail above; report them as missing.
        if pandas.isnull(x[0]) or pandas.isnull(x[1]):
            return np.nan
        # Bare raise propagates the error with its original traceback.
        raise
v_cl[self.v_chain_type[v]] = []
v_cl[self.v_chain_type[v]].append(v)
f, s = pSeq[:pos1], pSeq[pos1 + 1:]
for v1, v2 in v_cl.items():
for v3 in v2:
if v3 not in self.vi_pieces:
continue
v, vv = self.vi_pieces[v3]
minlen1 = min(len(f), len(v))
minlen2 = min(len(s), len(vv))
if minlen1 > 0:
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
else:
mismatch1 = 0
if minlen2 > 0:
mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
else:
mismatch2 = 0
if (minlen1 == 0 and mismatch2 <= 1) or (minlen1 > 3 and mismatch1 <= 1 and minlen2 >= 2 and mismatch2 <= 2):
vtypes[v3] = mismatch1 + mismatch2
if pos2 != [-1, -1]:
if pos2[0] != -1:
if pos2[1] > 10:
offset = pos2[1] - 10
else:
offset = 0
kmrs2 = self.kmers(pSeq[offset:], 3)
interJ = set(kmrs2) & jkeys
jlist = []
for j in interJ:
jlist.extend(list(self.hashJ[j]))
if jlist:
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
else:
mismatch1 = 0
if minlen2 > 0:
mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
else:
mismatch2 = 0
if (minlen1 == 0 and mismatch2 == 0) or (minlen1 > 0 and mismatch1 <= 1 and minlen2 > 0 and mismatch2 <= 2):
vtypes.add(v_t)
if pos2 != -1:
f, s = pSeq[:pos2], pSeq[pos2 + 1:]
for j, jj, j_t in self.jays[chain_name]:
minlen1 = min(len(f), len(j))
minlen2 = min(len(s), len(jj))
if minlen2 > 0:
mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(jj[:minlen2]))
else:
mismatch2 = 0
if minlen1 > 0:
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(j[-minlen1:]))
else:
mismatch1 = 0
if (minlen2 == 0 and mismatch1 <= 1) or (minlen2 > 0 and mismatch2 <= 1 and minlen1 > 0 and mismatch1 <= 0):
jtypes.add(j_t)
if vtypes and jtypes:
found = True
cdr3 = pSeq[pos1: pos2 + 1]
if cdr3 not in self.pSeq_read_map:
full_cdr3.append(cdr3)
self.pSeq_read_map[cdr3] = {"v": vtypes, "j": jtypes, "chain_type": chain_name}
elif vtypes:
found = True
pos2 = pSeq.rfind(letter)
if pos1 != -1 and pos2 != -1 and pos2 - pos1 < 5:
continue
vtypes = set()
jtypes = set()
if pos1 != -1:
f, s = pSeq[:pos1], pSeq[pos1 + 1:]
for v, vv, v_t in self.vis[chain_name]:
minlen1 = min(len(f), len(v))
minlen2 = min(len(s), len(vv))
if minlen1 > 0:
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
else:
mismatch1 = 0
if minlen2 > 0:
mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
else:
mismatch2 = 0
if (minlen1 == 0 and mismatch2 == 0) or (minlen1 > 0 and mismatch1 <= 1 and minlen2 > 0 and mismatch2 <= 2):
vtypes.add(v_t)
if pos2 != -1:
f, s = pSeq[:pos2], pSeq[pos2 + 1:]
for j, jj, j_t in self.jays[chain_name]:
minlen1 = min(len(f), len(j))
minlen2 = min(len(s), len(jj))
if minlen2 > 0:
mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(jj[:minlen2]))
else:
mismatch2 = 0
if minlen1 > 0:
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(j[-minlen1:]))
else:
features_cand.append(doc_id)
if keys != None:
if rel:
columns.append('rel')
if self.unique_kw in keys or isVirtual:
features_cand.append(1)
seen.add(self.unique_kw)
else:
features_cand.append(0)
if rel_approx:
columns.append('rel_approx')
max_gold_ = ('', 0.)
for gold_key in keys:
dist = 1.-jellyfish.levenshtein_distance(gold_key, self.unique_kw ) / max(len(gold_key), len(self.unique_kw)) # _tL
if max_gold_[1] < dist:
max_gold_ = ( gold_key, dist )
features_cand.append(max_gold_[1])
columns.append('kw')
features_cand.append(self.unique_kw)
columns.append('h')
features_cand.append(self.H)
columns.append('tf')
features_cand.append(self.tf)
columns.append('size')
features_cand.append(self.size)
columns.append('isVirtual')
features_cand.append(int(isVirtual))
for feature_name in features: