:param agent: Agent id whose span is being entity linked
:param uuid: uuid of scenario containing KB for given agent
:return:
"""
# Use heuristic scoring system
#print 'span:', span
if not self.learned_lex:
entity_scores = []
for c in candidates:
#print 'c:', c
# Clean up punctuation
c_s = re.sub("-", " ", c[0])
span_tokens = span.split()
entity_tokens = c_s.split()
ed = editdistance.eval(span, c[0])
# Filter false positives
if c[1] not in kb_entity_types:
#print 'false type'
continue
def is_stopwords():
if span == c[0]:
return False
if len(span_tokens) == 1 and span in self.stop_words:
return True
if span_tokens[0] in ('and', 'or', 'to', 'from', 'of', 'in', 'at'):
return True
all_stop = True
for x in span_tokens:
    if x not in self.stop_words:
        all_stop = False
        break
return all_stop
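
# A minimal, self-contained sketch of the heuristic above: score candidate
# entity strings for a span by normalized edit distance and keep the closest
# one. The function name, candidate list and threshold are illustrative
# assumptions, not part of the original linker.
import editdistance

def best_candidate(span, candidate_strings, max_norm_dist=0.5):
    """Return the candidate closest to `span`, or None if none is close enough."""
    best, best_dist = None, None
    for cand in candidate_strings:
        dist = editdistance.eval(span, cand) / max(len(span), len(cand), 1)
        if best_dist is None or dist < best_dist:
            best, best_dist = cand, dist
    return best if best_dist is not None and best_dist <= max_norm_dist else None

# e.g. best_candidate("morningside hts", ["morningside heights", "midtown west"])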
for i, y_hat in enumerate(y_hats):
y_true = ys_pad[i]
seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
seq_hat_text = "".join(seq_hat).replace(self.recog_args.space, ' ')
seq_hat_text = seq_hat_text.replace(self.recog_args.blank, '')
seq_true_text = "".join(seq_true).replace(self.recog_args.space, ' ')
hyp_words = seq_hat_text.split()
ref_words = seq_true_text.split()
word_eds.append(editdistance.eval(hyp_words, ref_words))
word_ref_lens.append(len(ref_words))
hyp_chars = seq_hat_text.replace(' ', '')
ref_chars = seq_true_text.replace(' ', '')
char_eds.append(editdistance.eval(hyp_chars, ref_chars))
char_ref_lens.append(len(ref_chars))
wer = 0.0 if not self.report_wer else float(sum(word_eds)) / sum(word_ref_lens)
cer = 0.0 if not self.report_cer else float(sum(char_eds)) / sum(char_ref_lens)
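
# A self-contained sketch of the metric computed above: word error rate (WER)
# and character error rate (CER) as edit distance normalized by reference
# length. The example strings below are illustrative only.
import editdistance

def wer_cer(hyp_text, ref_text):
    hyp_words, ref_words = hyp_text.split(), ref_text.split()
    wer = editdistance.eval(hyp_words, ref_words) / max(len(ref_words), 1)
    hyp_chars, ref_chars = hyp_text.replace(' ', ''), ref_text.replace(' ', '')
    cer = editdistance.eval(hyp_chars, ref_chars) / max(len(ref_chars), 1)
    return wer, cer

# e.g. wer_cer("the cat sat", "the cat sat down") -> (0.25, ~0.31)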
alpha = self.mtlalpha
if alpha == 0:
self.loss = self.loss_att
loss_att_data = float(self.loss_att)
loss_ctc_data_list = [None] * (self.num_encs + 1)
elif alpha == 1:
self.loss = torch.sum(torch.cat(
[(item * self.weights_ctc_train[i]).unsqueeze(0) for i, item in enumerate(self.loss_ctc_list)]))
loss_att_data = None
loss_ctc_data_list = [float(self.loss)] + [float(item) for item in self.loss_ctc_list]
else:
    # 0 < mtlalpha < 1: interpolate the weighted CTC loss with the attention loss
    self.loss_ctc = torch.sum(torch.cat(
        [(item * self.weights_ctc_train[i]).unsqueeze(0) for i, item in enumerate(self.loss_ctc_list)]))
    self.loss = alpha * self.loss_ctc + (1 - alpha) * self.loss_att
    loss_att_data = float(self.loss_att)
    loss_ctc_data_list = [float(self.loss_ctc)] + [float(item) for item in self.loss_ctc_list]
candidates = [x for x in
              subprocess.check_output(
                  f'find "{escp_topdir}" -type f -name {esc_basename}',
                  shell=True, universal_newlines=True)
              .split('\n')
              if x]
# Select which file to inline:
if len(candidates) == 1:
# If there's exactly one match, then we're done:
file_to_inline = candidates[0]
elif len(candidates) > 1:
# We have multiple candidates to inline, so we'll compare the
# full paths (relative to the top directory) to select the one
# whose name is the closest match:
rel_matches = [match[len(topdir) + 1:] for match in candidates]
distances = [editdistance.eval(include, path) for path in rel_matches]
min_distance = min(distances)
file_to_inline = candidates[distances.index(min_distance)]
log.debug(f"Inferred include '{file_to_inline}' from '{line}' with distance {min_distance}")
else:
# We didn't find anything suitable:
file_to_inline = None
# Process the inline file:
if file_to_inline in stack:
# We've already inlined this file, so ignore it:
outlines.append(clgen.format_as_comment(
lang, f'[FETCH] ignored_include({line})'))
elif file_to_inline:
# Inline the file by recursively expanding its contents:
outlines.append(clgen.format_as_comment(
lang, f'[FETCH] begin_include({line})'))
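
# A hedged sketch of the path-matching step above: given an include name and a
# list of candidate paths relative to the top directory, pick the path whose
# spelling has the smallest edit distance. `closest_path` is an illustrative
# helper, not part of the original module.
import editdistance

def closest_path(include_name, rel_paths):
    """Return the candidate path closest to `include_name`, or None if empty."""
    if not rel_paths:
        return None
    return min(rel_paths, key=lambda path: editdistance.eval(include_name, path))

# e.g. closest_path("util/math.h", ["src/util/math.h", "include/other.h"])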
def dmetr(name1, name2):
max_len = max(len(name1), len(name2))
max_dist = int(ceil(max_len*(1.0-thresh)))
ldist = levenshtein(name1, name2)
return (1.0 - float(ldist)/max_len) if (ldist != -1 and max_len != 0) else 0.0
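
# A usage-style sketch of the similarity above, assuming editdistance.eval as
# the Levenshtein implementation; `name_similarity` is an illustrative name.
import editdistance

def name_similarity(name1, name2):
    """1.0 for identical strings, approaching 0.0 as every character differs."""
    max_len = max(len(name1), len(name2))
    if max_len == 0:
        return 0.0
    return 1.0 - editdistance.eval(name1, name2) / max_len

# e.g. name_similarity("color", "colour") == 1 - 1/6 ≈ 0.83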
def show_edit_distance(self, num):
num_left = num
mean_norm_ed = 0.0
mean_ed = 0.0
while num_left > 0:
word_batch = next(self.text_img_gen)[0]
num_proc = min(word_batch['the_input'].shape[0], num_left)
decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
for j in range(0, num_proc):
edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
mean_ed += float(edit_dist)
mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
num_left -= num_proc
mean_norm_ed = mean_norm_ed / num
mean_ed = mean_ed / num
print('\nOut of %d samples: Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
% (num, mean_ed, mean_norm_ed))
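
# A standalone sketch of the evaluation loop above: mean and length-normalized
# mean edit distance over (prediction, ground-truth) pairs. The sample pairs
# are made up for illustration.
import editdistance

def mean_edit_distances(pairs):
    total, total_norm = 0.0, 0.0
    for pred, truth in pairs:
        dist = editdistance.eval(pred, truth)
        total += dist
        total_norm += dist / max(len(truth), 1)
    n = max(len(pairs), 1)
    return total / n, total_norm / n

# e.g. mean_edit_distances([("kitten", "sitting"), ("flaw", "flaw")]) -> (1.5, ~0.21)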
except:
break
r1 = next(p).rstrip()
r2 = next(p).rstrip()
total_reads += 1
total_ref_bp += len(r1)
if r1 == r2:
continue
if len(r1) != len(r2):
num_adapters += 1
l1 = len(r1)
n = min(len(r2) - l1, len(adapter))
total_adapter_bp += n
total_adapter_edit_dist += editdistance.eval(r2[l1:(l1+n)], adapter[0:n])
r2 = r2[0:l1]
if r1 != r2:
num_reads_mismatch += 1
total_edit_dist += editdistance.eval(r1,r2)
w.writerow((i,
num_adapters, num_reads_mismatch, total_reads,
total_edit_dist, total_ref_bp,
total_adapter_edit_dist, total_adapter_bp
))
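
# A hedged sketch of the adapter check above: when the second read is longer
# than the first, compare its overhanging bases against the expected adapter
# prefix with edit distance. The function and its arguments are illustrative.
import editdistance

def adapter_overhang_distance(r1, r2, adapter):
    """Return (edit distance of r2's overhang vs. the adapter prefix, overhang length)."""
    if len(r2) <= len(r1):
        return 0, 0
    n = min(len(r2) - len(r1), len(adapter))
    overhang = r2[len(r1):len(r1) + n]
    return editdistance.eval(overhang, adapter[:n]), n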
file2_metadata = metadata_dict[file2]
intersect_features = set(file1_metadata.keys()) & set(file2_metadata.keys())
intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]
file_edit_distance = 0.0
for feature in intersect_features:
file1_feature_value = stringify(file1_metadata[feature])
file2_feature_value = stringify(file2_metadata[feature])
if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
feature_distance = 0.0
else:
feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / max(len(file1_feature_value), len(file2_feature_value))
file_edit_distance += feature_distance
if allKeys:
file1_only_features = set(file1_metadata.keys()) - set(intersect_features)
file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]
file2_only_features = set(file2_metadata.keys()) - set(intersect_features)
file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]
file_edit_distance += len(file1_only_features) + len(file2_only_features)  # add 1 for each feature present in only one file; a disjoint feature's normalized distance is taken to be 1
file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))
else:
file_edit_distance /= float(len(intersect_features)) #average edit distance
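
# A compact sketch of the per-feature distance used above: normalized edit
# distance between two stringified metadata values, defined as 0.0 when both
# are empty. `feature_distance` here is an illustrative standalone helper.
import editdistance

def feature_distance(value1, value2):
    v1, v2 = str(value1), str(value2)
    if not v1 and not v2:
        return 0.0
    return editdistance.eval(v1, v2) / max(len(v1), len(v2))

# e.g. feature_distance("image/png", "image/jpeg") == 2 / 10 == 0.2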
api = tweepy.API(auth,wait_on_rate_limit=True)
c = tweepy.Cursor(api.search,q='to:'+'elonmusk',since_id='1017463575919693826',include_entities=True).items()
while True:
try:
tweet = c.next()
text = tweet.text
name = tweet.user.name
screen_name = tweet.user.screen_name
tweet_id = tweet.id
print(tweet_id)
if screen_name != 'elonmusk':
name_dist = editdistance.eval('Elon Musk', name)
screen_dist = editdistance.eval('elonmusk',screen_name)
if name_dist <= 2 or screen_dist <= 4:
print("fake found")
api.update_status("*❗️❗️beep boop❗️❗️* Fake Elon Musk detected❎, report as spam ❎ @elonmusk @{}".format(screen_name), in_reply_to_status_id=tweet_id)
api.report_spam(screen_name=screen_name)
except tweepy.TweepError:
print("limit reached")
time.sleep(60*5)
continue
except StopIteration:
print("end of result")
time.sleep(10)
c = tweepy.Cursor(api.search,q='to:'+'elonmusk',since_id=tweet_id,include_entities=True).items()
continue
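
# A hedged sketch of the impersonation check above, separated from the Twitter
# API calls so it can be tested offline; the thresholds mirror the ones used in
# the snippet, and the function name is an illustrative choice.
import editdistance

def looks_like_impersonator(display_name, screen_name,
                            real_name='Elon Musk', real_handle='elonmusk'):
    if screen_name == real_handle:
        return False
    return (editdistance.eval(real_name, display_name) <= 2
            or editdistance.eval(real_handle, screen_name) <= 4)

# e.g. looks_like_impersonator('Elon Musk.', 'elonmuusk') -> True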
def levenshtein_avg(weights, seq1, seq2):
norm = .5 * (len(seq1) + len(seq2))
return 1 - (editdistance.eval(seq1, seq2) / norm)
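
# Usage sketch for levenshtein_avg above; the sequences are arbitrary examples
# and the unused `weights` argument is passed as None.
print(levenshtein_avg(None, 'kitten', 'sitting'))  # 1 - 3 / 6.5 ≈ 0.54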