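# Build a lookup from (article_id, paragraph_num) to the tokenized paragraph text, then
# walk a shuffled set of predictions, scoring each predicted span against the gold
# answers and locating the sentence the predicted span falls in.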
for p in dev_data.values():
    paragraph_map[(p.article_id, p.paragraph_num)] = p.context

np.random.shuffle(answers)

# tmp = open("/tmp/tmp.csv", "w")
for prediction in answers:
    point = dev_data[prediction.question_id]
    start, end = prediction.doc_span
    context = paragraph_map[(point.article_id, prediction.paragraph_num)]
    text = origin_mapping.get_raw_text(point.article_id, prediction.paragraph_num, start, end)

    text_f1 = 0
    for ans in point.answer:
        text_f1 = max(text_f1, text_f1_score(text, ans.text))

    ans_sent = 0
    offset = 0
    while end >= offset + len(context[ans_sent]):
        offset += len(context[ans_sent])
        ans_sent += 1
    sent_start = start - offset
    sent_end = end - offset

    question_words = set(x.lower() for x in point.question if x.lower() not in stop)
    if prediction.paragraph_num != point.paragraph_num and text_f1 == 0:
        # tmp.write(" ".join(point.question))
        # tmp.write("\t" + point.article_title)
        # tmp.write("\t" + text)
        # tmp.write("\t" + str(list(set(x.text for x in point.answer))))
        pass
for q, spans, span_vals in pred.eval(doc, sample_per_doc, sess):
    correct_para_num = quid_to_paragraph[q.question_id].paragraph_num
    predicted_paragraph = np.argmax(span_vals)
    answer = QuestionAnswer(q.question_id, predicted_paragraph,
                            spans[predicted_paragraph], spans[correct_para_num])
    question_answers.append(QuestionAnswerFull(q.question_id, span_vals, spans))

    q_span_f1 = 0
    q_text_f1 = 0
    para_text = doc.paragraphs[correct_para_num].get_original_text(*answer.para_span)
    for ans in q.answer:
        q_span_f1 = max(q_span_f1, compute_span_f1((ans.para_word_start, ans.para_word_end), answer.para_span))
        q_text_f1 = max(q_text_f1, squad_official_f1_score(para_text, ans.text))

    doc_span_f1 = 0
    doc_text_f1 = 0
    if answer.paragraph_num == correct_para_num:
        doc_span_f1, doc_text_f1 = q_span_f1, q_text_f1
    else:
        doc_text = doc.paragraphs[answer.paragraph_num].get_original_text(*answer.doc_span)
        for ans in q.answer:
            doc_text_f1 = max(doc_text_f1, squad_official_f1_score(doc_text, ans.text))

    question_results.append(QuestionResult(q.question_id, predicted_paragraph == correct_para_num,
                                           doc_span_f1, doc_text_f1, q_span_f1, q_text_f1))
print("CorrectPara=%.4f, DocSpanF1=%.4f, DocTextF1=%.4f, ParaSpanF1=%.4f, ParaTextF1=%.4f" % (
np.mean([x.right_para for x in question_results]),
np.mean([x.doc_span_f1 for x in question_results]),
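
# Dump answers keyed by question id for the official SQuAD evaluator; questions whose
# no-answer probability is 0.15 or higher are written as empty strings, and a quick
# F1-style score is printed as a sanity check.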
if args.official_output is not None:
    q_id_to_answers = {}
    q_ids = evaluation.per_sample["question_id"]
    spans = evaluation.per_sample["spans"]
    confs = evaluation.per_sample["none_prob"]

    score = 0
    for q_id, (start, end), conf in zip(q_ids, spans, confs):
        answer = quid_to_q[q_id].answer
        if conf < 0.15:
            text = " ".join(quid_to_q[q_id].get_context()[start:end + 1])
            if len(answer.answer_text) > 0:
                score += max(f1_score(a, text) for a in answer.answer_text)
            q_id_to_answers[q_id] = text
        else:
            if len(answer.answer_text) == 0:
                score += 1
            q_id_to_answers[q_id] = ""

    print("Score: %.4f" % (score / len(q_ids)))
    with open(args.official_output + name + ".json", "w") as f:
        json.dump(q_id_to_answers, f)
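
# Optionally pickle the per-question answers gathered by the evaluation loop above.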
if output is not None:
    with open(output, "wb") as f:
        pickle.dump(question_answers, f)

print('Done')
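

# Body of a per-example scoring loop (the enclosing `for i ...` over `data` is not shown
# here): `prediction[i]` is the predicted word span, and scores[i] records exact span
# match, span F1, exact text match, and text F1 against the gold answers.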
para = data[i]
pred_span = tuple(prediction[i])
pred_text = para.get_original_text(pred_span[0], pred_span[1])

span_correct = False
span_max_f1 = 0
text_correct = 0
text_max_f1 = 0

answer = data[i].answer
for (start, end), text in zip(answer.answer_spans, answer.answer_text):
    answer_span = (start, end)
    span_max_f1 = max(span_max_f1, compute_span_f1(answer_span, pred_span))
    if answer_span == pred_span:
        span_correct = True
    f1 = squad_official_f1_score(pred_text, text)
    correct = squad_official_em_score(pred_text, text)
    text_correct = max(text_correct, correct)
    text_max_f1 = max(text_max_f1, f1)

scores[i] = [span_correct, span_max_f1, text_correct, text_max_f1]
return scores
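
# compute_span_f1 and squad_official_f1_score above are project helpers that these
# snippets do not define. Purely as a hypothetical sketch (assuming inclusive
# (start, end) word spans, which is what the call sites suggest), a token-overlap
# span F1 could look like the stand-in below; it is not the project's implementation.
def _span_f1_sketch(true_span, pred_span):
    start = max(true_span[0], pred_span[0])
    end = min(true_span[1], pred_span[1])
    if end < start:
        return 0.0  # disjoint spans share no words
    overlap = end - start + 1
    precision = overlap / (pred_span[1] - pred_span[0] + 1)
    recall = overlap / (true_span[1] - true_span[0] + 1)
    return 2 * precision * recall / (precision + recall)


# Top-n paragraph analysis: visit paragraphs in order of their predicted ranking, keep
# the best-scoring answer seen so far, and accumulate text/span F1 for each cutoff n so
# mean F1 can be plotted against the number of paragraphs considered.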
n_questions += len(para.questions)
for question in para.questions:
    answer = answers[question.question_id]
    best_val = -1
    text_f1 = -1
    span_f1 = 0
    for r, i in enumerate(np.argsort(-np.array(para_predictions[question.question_id]))):
        val = answer.span_vals[i]
        if val > best_val:
            best_val = val
            answer_text = doc.paragraphs[i].get_original_text(*answer.spans[i])
            text_f1 = 0
            for ans in question.answer:
                text_f1 = max(text_f1, squad_official_f1_score(answer_text, ans.text))
            span_f1 = 0
            if i == para.paragraph_num:  # correct paragraph
                for ans in question.answer:
                    span_f1 = max(span_f1, compute_span_f1(answer.spans[i], (ans.para_word_start, ans.para_word_end)))
        top_n_f1_score[r] += text_f1
        top_n_span_score[r] += span_f1
    top_n_f1_score[len(doc.paragraphs):max_para_len] += text_f1
    top_n_span_score[len(doc.paragraphs):max_para_len] += span_f1

plt.plot(np.arange(0, max_para_len) + 1, top_n_f1_score / n_questions)
plt.show()
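
# The next line is the tail of the multi-line call that produced `evaluation` (its start
# is not part of this snippet). The sweep below tries several no-answer thresholds on the
# dev set and reports the resulting score and how often "no answer" is predicted.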
corpus.get_resource_loader(), checkpoint, args.ema)
print("Choosing threshold")
e = evaluation["dev"].per_sample
q_ids = e["question_id"]
spans = e["spans"]
confs = e["none_prob"]
for th in [0, 0.1, 0.15, 0.2, 0.25]:
score = 0
none = 0
for q_id, (start, end), conf in zip(q_ids, spans, confs):
answer = quid_to_q[q_id].answer
if conf < th:
text = " ".join(quid_to_q[q_id].get_context()[start:end + 1])
if len(answer.answer_text) > 0:
score += max(f1_score(a, text) for a in answer.answer_text)
else:
none += 1
if len(answer.answer_text) == 0:
score += 1
print("%s: %.4f (predicted %d (%.4f))" % (str(th), score/len(q_ids), none, none/len(q_ids)))

# Print the scalar results in a two column table
for name, evaluation in evaluation.items():
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))