# SQuAD answer-span detection and preprocessing utilities.
def _squad_answer_detector(paragraph: Paragraph,
                           normalized_text: List[str],
                           tagged_spans: ParagraphSpans):
    """Locate all token spans in `paragraph` that match a tagged answer after normalization.

    For each start index, greedily extends a candidate span one token at a time
    as long as its normalized text is still a prefix of some normalized answer,
    recording every exact match. Afterwards, every human-tagged span that was
    not re-discovered is either a sanity-check failure (its normalized text
    equals an answer, so it should have been found) or is appended as-is so it
    can still be used as training data.

    Args:
        paragraph: the context paragraph; provides `n_context_words` and
            `get_original_text(start, end)` (inclusive word indices).
        normalized_text: per-word normalized text, precomputed by the caller
            so it is shared across all questions on this paragraph.
        tagged_spans: the annotated answer spans, each with `.text`,
            `.para_word_start` and `.para_word_end`.

    Returns:
        np.ndarray of shape (n_spans, 2), dtype int32, of inclusive
        (start_word, end_word) spans.

    Raises:
        RuntimeError: if a tagged span's normalized text matches an answer but
            the scan failed to find it (sanity check).
        ValueError: if no answer spans were found at all.
    """
    correct_answer_text = [normalize_answer(x.text) for x in tagged_spans]
    answer_spans = []
    n_words = paragraph.n_context_words
    for ix in range(n_words):
        word = normalized_text[ix]
        for answer in correct_answer_text:
            span_text = word
            end_ix = ix
            # NOTE(review): any_found is never set True anywhere in this
            # function, so the `if any_found` branch below is dead code —
            # possibly an assignment was lost upstream; confirm against history.
            any_found = False
            while True:
                if span_text == answer:
                    answer_spans.append((ix, end_ix))
                    # continue in case the span including the next token also matches when normalized
                elif not answer.startswith(span_text):
                    break
                end_ix += 1
                if end_ix == n_words:
                    break
                next_token = normalized_text[end_ix]
                # Cheap pre-filter: the next token's characters must still occur
                # in the unmatched remainder of the answer before paying for a
                # full re-normalization of the extended span.
                if next_token not in answer[len(span_text):]:
                    break
                span_text = normalize_answer(paragraph.get_original_text(ix, end_ix))
            if any_found:
                answer_spans.append((ix, end_ix))
                break
    for x in tagged_spans:
        start, end = x.para_word_start, x.para_word_end
        if (start, end) not in answer_spans:
            extracted = normalize_answer(paragraph.get_original_text(start, end))
            if any(extracted == x for x in correct_answer_text):
                raise RuntimeError("Missed an answer span!")  # Sanity check, we should have extracted this
            else:
                # normally due to the correct text being cut off mid word, or otherwise text that does not
                # land between our tokens
                # in this case we will just include the tagged span as training data anyway
                answer_spans.append((start, end))
    if len(answer_spans) == 0:
        raise ValueError()
    else:
        return np.array(answer_spans, dtype=np.int32)
def preprocess(data: List[Document]) -> List[DocParagraphAndQuestion]:
    """Turn a list of documents into per-question examples with detected answer spans."""
    examples = []
    for document in data:
        for para in document.paragraphs:
            # Precomputed once per each paragraph to speed up our answer detection algorithm
            norm_tokens = [normalize_answer(para.get_original_text(i, i))
                           for i in range(para.n_context_words)]
            for question in para.questions:
                detected_spans = _squad_answer_detector(para, norm_tokens, question.answer)
                answer_texts = [span.text for span in question.answer]
                examples.append(DocParagraphAndQuestion(question.words,
                                                        TokenSpans(answer_texts, detected_spans),
                                                        question.question_id,
                                                        para))
    return examples
# NOTE: a truncated duplicate of _squad_answer_detector's loop tail and
# span-validation code (including the final np.array return) was pasted here
# by a bad merge; it began with a bare module-level `break` and was not valid
# Python. The canonical, complete implementation lives in
# _squad_answer_detector above, so the dead fragment has been removed.