How to use the squad.squad_official_evaluation.f1_score function in squad

To help you get started, we’ve selected a few squad examples based on popular ways the f1_score function is used in public projects.
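
Every example below follows the same pattern: call f1_score on a predicted answer string and each reference answer, then keep the maximum. Here is a minimal sketch of that pattern, assuming the import path from this page’s title resolves against a document-qa checkout on your PYTHONPATH; the answer strings are made up for illustration.

from squad.squad_official_evaluation import f1_score

# Hypothetical prediction and reference answers, for illustration only.
prediction = "the Eiffel Tower"
ground_truths = ["Eiffel Tower", "the Louvre"]

# The metric is computed per reference answer; as in the project snippets
# below, take the best F1 over all references.
best_f1 = max(f1_score(prediction, gt) for gt in ground_truths)
print("F1 = %.4f" % best_f1)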

github allenai / document-qa / docqa / data_analysis / visualize_full_doc_errors.py View on Github external
    for p in dev_data.values():
        paragraph_map[(p.article_id, p.paragraph_num)] = p.context

    np.random.shuffle(answers)
    # tmp = open("/tmp/tmp.csv", "w")

    for prediction in answers:
        point = dev_data[prediction.question_id]
        start, end = prediction.doc_span

        context = paragraph_map[(point.article_id, prediction.paragraph_num)]
        text = origin_mapping.get_raw_text(point.article_id, prediction.paragraph_num, start, end)

        text_f1 = 0
        for ans in point.answer:
            text_f1 = max(text_f1, text_f1_score(text, ans.text))

        ans_sent = 0
        offset = 0
        while end >= offset+len(context[ans_sent]):
            offset += len(context[ans_sent])
            ans_sent += 1
        sent_start = start-offset
        sent_end = end - offset

        question_words = set(x.lower() for x in point.question if x.lower() not in stop)

        if prediction.paragraph_num != point.paragraph_num and text_f1 == 0:
            # tmp.write(" ".join(point.question))
            # tmp.write("\t" + point.article_title)
            # tmp.write("\t" + text)
            # tmp.write("\t" + str(list(set(x.text for x in point.answer))))

github allenai / document-qa / eval / full_document_eval.py View on Github external
        for q, spans, span_vals in pred.eval(doc, sample_per_doc, sess):
            correct_para_num = quid_to_paragraph[q.question_id].paragraph_num

            predicted_paragraph = np.argmax(span_vals)
            answer = QuestionAnswer(q.question_id, predicted_paragraph,
                                    spans[predicted_paragraph], spans[correct_para_num])

            question_answers.append(QuestionAnswerFull(q.question_id, span_vals, spans))

            q_span_f1 = 0
            q_text_f1 = 0
            para_text = doc.paragraphs[correct_para_num].get_original_text(*answer.para_span)

            for ans in q.answer:
                q_span_f1 = max(q_span_f1, compute_span_f1((ans.para_word_start, ans.para_word_end), answer.para_span))
                q_text_f1 = max(q_text_f1, squad_official_f1_score(para_text, ans.text))

            doc_span_f1 = 0
            doc_text_f1 = 0
            if answer.paragraph_num == correct_para_num:
                doc_span_f1, doc_text_f1 = q_span_f1, q_text_f1
            else:
                doc_text = doc.paragraphs[answer.paragraph_num].get_original_text(*answer.doc_span)
                for ans in q.answer:
                    doc_text_f1 = max(doc_text_f1, squad_official_f1_score(doc_text, ans.text))

            question_results.append(QuestionResult(q.question_id, predicted_paragraph == correct_para_num,
                                                   doc_span_f1, doc_text_f1, q_span_f1, q_text_f1))

        print("CorrectPara=%.4f, DocSpanF1=%.4f, DocTextF1=%.4f, ParaSpanF1=%.4f, ParaTextF1=%.4f" % (
            np.mean([x.right_para for x in question_results]),
            np.mean([x.doc_span_f1 for x in question_results]),

github allenai / document-qa / experimental / niket_eval.py View on Github external
        table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
        print_table([header] + transpose_lists(table))

        if args.official_output is not None:
            q_id_to_answers = {}
            q_ids = evaluation.per_sample["question_id"]
            spans = evaluation.per_sample["spans"]
            confs = evaluation.per_sample["none_prob"]
            score = 0
            for q_id, (start, end), conf in zip(q_ids, spans, confs):
                answer = quid_to_q[q_id].answer

                if conf < 0.15:
                    text = " ".join(quid_to_q[q_id].get_context()[start:end+1])
                    if len(answer.answer_text) > 0:
                        score += max(f1_score(a, text) for a in answer.answer_text)
                    q_id_to_answers[q_id] = text
                else:
                    if len(answer.answer_text) == 0:
                        score += 1
                    q_id_to_answers[q_id] = ""

            print("Score: %.4f" % (score/len(q_ids)))

            with open(args.official_output + name + ".json", "w") as f:
                json.dump(q_id_to_answers, f)
#

github allenai / document-qa / eval / full_document_eval.py View on Github external
            q_span_f1 = 0
            q_text_f1 = 0
            para_text = doc.paragraphs[correct_para_num].get_original_text(*answer.para_span)

            for ans in q.answer:
                q_span_f1 = max(q_span_f1, compute_span_f1((ans.para_word_start, ans.para_word_end), answer.para_span))
                q_text_f1 = max(q_text_f1, squad_official_f1_score(para_text, ans.text))

            doc_span_f1 = 0
            doc_text_f1 = 0
            if answer.paragraph_num == correct_para_num:
                doc_span_f1, doc_text_f1 = q_span_f1, q_text_f1
            else:
                doc_text = doc.paragraphs[answer.paragraph_num].get_original_text(*answer.doc_span)
                for ans in q.answer:
                    doc_text_f1 = max(doc_text_f1, squad_official_f1_score(doc_text, ans.text))

            question_results.append(QuestionResult(q.question_id, predicted_paragraph == correct_para_num,
                                                   doc_span_f1, doc_text_f1, q_span_f1, q_text_f1))

        print("CorrectPara=%.4f, DocSpanF1=%.4f, DocTextF1=%.4f, ParaSpanF1=%.4f, ParaTextF1=%.4f" % (
            np.mean([x.right_para for x in question_results]),
            np.mean([x.doc_span_f1 for x in question_results]),
            np.mean([x.doc_text_f1 for x in question_results]),
            np.mean([x.para_span_f1 for x in question_results]),
            np.mean([x.para_text_f1 for x in question_results])
        ))

    if output is not None:
        with open(output, "wb") as f:
            pickle.dump(question_answers, f)
    print('Done')

github allenai / document-qa / squad / squad_evaluators.py View on Github external
        para = data[i]

        pred_span = tuple(prediction[i])
        pred_text = para.get_original_text(pred_span[0], pred_span[1])

        span_correct = False
        span_max_f1 = 0
        text_correct = 0
        text_max_f1 = 0
        answer = data[i].answer
        for (start, end), text in zip(answer.answer_spans, answer.answer_text):
            answer_span = (start, end)
            span_max_f1 = max(span_max_f1, compute_span_f1(answer_span, pred_span))
            if answer_span == pred_span:
                span_correct = True
            f1 = squad_official_f1_score(pred_text, text)
            correct = squad_official_em_score(pred_text, text)
            text_correct = max(text_correct, correct)
            text_max_f1 = max(text_max_f1, f1)

        scores[i] = [span_correct, span_max_f1, text_correct, text_max_f1]

    return scores

github allenai / document-qa / experimental / batch_paragraph_selection / show_paragraph_selection_fixes.py View on Github external
            n_questions += len(para.questions)
            for question in para.questions:
                answer = answers[question.question_id]

                best_val = -1
                text_f1 = -1
                span_f1 = 0
                for r, i in enumerate(np.argsort(-np.array(para_predictions[question.question_id]))):
                    val = answer.span_vals[i]
                    if val > best_val:
                        best_val = val

                        answer_text = doc.paragraphs[i].get_original_text(*answer.spans[i])
                        text_f1 = 0
                        for ans in question.answer:
                            text_f1 = max(text_f1, squad_official_f1_score(answer_text, ans.text))

                        span_f1 = 0
                        if i == para.paragraph_num:  # correct paragraph
                            for ans in question.answer:
                                span_f1 = max(span_f1, compute_span_f1(answer.spans[i], (ans.para_word_start, ans.para_word_end)))

                    top_n_f1_score[r] += text_f1
                    top_n_span_score[r] += span_f1

                top_n_f1_score[len(doc.paragraphs):max_para_len] += text_f1
                top_n_span_score[len(doc.paragraphs):max_para_len] += span_f1

    plt.plot(np.arange(0, max_para_len)+1, top_n_f1_score/n_questions)
    plt.show()

github allenai / document-qa / experimental / niket_eval.py View on Github external
corpus.get_resource_loader(), checkpoint, args.ema)

    print("Choosing threshold")
    e = evaluation["dev"].per_sample
    q_ids = e["question_id"]
    spans = e["spans"]
    confs = e["none_prob"]
    for th in [0, 0.1, 0.15, 0.2, 0.25]:
        score = 0
        none = 0
        for q_id, (start, end), conf in zip(q_ids, spans, confs):
            answer = quid_to_q[q_id].answer
            if conf < th:
                text = " ".join(quid_to_q[q_id].get_context()[start:end + 1])
                if len(answer.answer_text) > 0:
                    score += max(f1_score(a, text) for a in answer.answer_text)
            else:
                none += 1
                if len(answer.answer_text) == 0:
                    score += 1
        print("%s: %.4f (predicted %d (%.4f))" % (str(th), score/len(q_ids), none, none/len(q_ids)))

    # Print the scalar results in a two column table
    for name, evaluation in evaluation.items():
        scalars = evaluation.scalars
        cols = list(sorted(scalars.keys()))
        table = [cols]
        header = ["Metric", ""]
        table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
        print_table([header] + transpose_lists(table))

        if args.official_output is not None: