How to use the `squad.squad_official_evaluation.normalize_answer` function from the `squad` package

To help you get started, we’ve selected a few `normalize_answer` examples, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github allenai / document-qa / experimental / squad_text_labels.py View on Github external
def _squad_answer_detector(paragraph: Paragraph,
                           normalized_text: List[str],
                           tagged_spans: ParagraphSpans) -> np.ndarray:
    """Find token spans in `paragraph` whose normalized text matches a tagged answer.

    Scans every start token and greedily grows candidate spans, comparing their
    normalized text against each normalized answer string. Hand-tagged spans that
    the scan failed to find are appended anyway so they are not lost as labels.

    :param paragraph: paragraph providing `n_context_words` and `get_original_text`
    :param normalized_text: per-token normalized text, precomputed by the caller
        (one entry per context word — see `preprocess`)
    :param tagged_spans: the hand-tagged answer spans for one question
    :return: int32 array of (start, end) word spans, both inclusive
    :raises RuntimeError: if a tagged span normalizes to an answer but was missed
        by the scan (internal sanity check)
    :raises ValueError: if no spans at all were found
    """
    correct_answer_text = [normalize_answer(x.text) for x in tagged_spans]

    answer_spans = []
    n_words = paragraph.n_context_words
    for ix in range(n_words):
        word = normalized_text[ix]
        for answer in correct_answer_text:
            # Grow a span starting at `ix` while its normalized text is still a
            # prefix of the normalized answer.
            span_text = word
            end_ix = ix
            # NOTE(review): `any_found` is never set True in the upstream source,
            # so the `if any_found` branch below never runs; preserved as-is.
            any_found = False
            while True:
                if span_text == answer:
                    answer_spans.append((ix, end_ix))
                    # keep growing: the span including the next token might also
                    # match once normalized
                elif not answer.startswith(span_text):
                    break
                end_ix += 1
                if end_ix == n_words:
                    break
                next_token = normalized_text[end_ix]
                # Cheap pre-filter: skip the (expensive) re-normalization when the
                # next token's text cannot appear in the remaining answer text.
                if next_token not in answer[len(span_text):]:
                    break
                span_text = normalize_answer(paragraph.get_original_text(ix, end_ix))
            if any_found:
                answer_spans.append((ix, end_ix))
                break

    # Make sure every hand-tagged span is represented, even if the scan missed it.
    for tagged in tagged_spans:
        start, end = tagged.para_word_start, tagged.para_word_end
        if (start, end) not in answer_spans:
            extracted = normalize_answer(paragraph.get_original_text(start, end))
            if any(extracted == ans for ans in correct_answer_text):
                raise RuntimeError("Missed an answer span!")  # Sanity check, we should have extracted this
            else:
                # normally due to the correct text being cut off mid word, or otherwise text that does not
                # land between our tokens
                # in this case we will just include the tagged span as training data anyway
                answer_spans.append((start, end))

    if len(answer_spans) == 0:
        raise ValueError("no answer spans found for the tagged answers")
    return np.array(answer_spans, dtype=np.int32)


def preprocess(data: List[Document]) -> List[DocParagraphAndQuestion]:
    """Flatten documents into per-question training examples with detected spans.

    :param data: documents whose paragraphs/questions will be converted
    :return: one `DocParagraphAndQuestion` per question, carrying the answer
        texts and the detected token spans
    """
    out = []
    for doc in data:
        for paragraph in doc.paragraphs:
            # Precomputed once per paragraph to speed up the answer detection
            # algorithm (normalization is the expensive step).
            normalized_text = [normalize_answer(paragraph.get_original_text(i, i))
                               for i in range(paragraph.n_context_words)]
            for q in paragraph.questions:
                spans = _squad_answer_detector(paragraph, normalized_text, q.answer)
                answer_text = [x.text for x in q.answer]
                out.append(DocParagraphAndQuestion(q.words, TokenSpans(answer_text, spans),
                                                   q.question_id, paragraph))
    return out