# Zero-pad up to the sequence length.
while len(input_ids) < self._max_sequence_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
assert len(input_ids) == self._max_sequence_length
assert len(input_mask) == self._max_sequence_length
assert len(segment_ids) == self._max_sequence_length

input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
segment_ids_tensor = torch.tensor(segment_ids, dtype=torch.long)

instance = Instance({"input_ids": MetadataField(input_ids_tensor),
                     "token_type_ids": MetadataField(segment_ids_tensor),
                     "attention_mask": MetadataField(input_mask_tensor),
                     "tokens": MetadataField(tokens),
                     "document_tokens": MetadataField(doc_tokens),
                     "token_to_original_map": MetadataField(token_to_orig_map),
                     "token_is_max_context": MetadataField(token_is_max_context)})
# We truncate the original doc to the defined max_sequence_length.
# Here we only process the first part of doc_spans and return the result.
return instance
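
The padding step above can be exercised on its own; below is a minimal sketch, assuming a plain max_sequence_length integer in place of the reader's self._max_sequence_length (the helper name pad_to_max_length and the example ids are made up).

import torch

def pad_to_max_length(input_ids, input_mask, segment_ids, max_sequence_length):
    """Right-pad the three BERT-style input lists with zeros and return long tensors."""
    while len(input_ids) < max_sequence_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    return (torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(input_mask, dtype=torch.long),
            torch.tensor(segment_ids, dtype=torch.long))

# Example: three real wordpiece ids padded out to length 8.
ids, mask, segments = pad_to_max_length([101, 2023, 102], [1, 1, 1], [0, 0, 0], 8)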
def text_to_instance(
    self,  # type: ignore
    tokens: List[str],
    pos_tags: List[str] = None,
    arc_indices: List[Tuple[int, int]] = None,
    arc_tags: List[str] = None,
) -> Instance:
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    fields["metadata"] = MetadataField({"tokens": tokens})
    if pos_tags is not None:
        fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")
    if arc_indices is not None and arc_tags is not None:
        fields["arc_tags"] = AdjacencyField(arc_indices, token_field, arc_tags)
    return Instance(fields)
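
For comparison, the same fields can be assembled outside the reader; a minimal sketch assuming AllenNLP's SingleIdTokenIndexer, with a made-up sentence, POS tags, and a single labeled arc.

from allennlp.data import Instance, Token
from allennlp.data.fields import TextField, SequenceLabelField, AdjacencyField, MetadataField
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = ["The", "dog", "barks"]
token_field = TextField([Token(t) for t in tokens], {"tokens": SingleIdTokenIndexer()})
fields = {
    "tokens": token_field,
    "metadata": MetadataField({"tokens": tokens}),
    "pos_tags": SequenceLabelField(["DET", "NOUN", "VERB"], token_field, label_namespace="pos"),
    # One labeled arc: "barks" -> "dog".
    "arc_tags": AdjacencyField([(2, 1)], token_field, ["nsubj"]),
}
instance = Instance(fields)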
fields['question'] = TextField(question_tokens, token_indexers)
fields['span_start'] = ListField([IndexField(span_start, passage_field)
                                  for span_start in span_starts])
# TODO: Consider converting `answer_texts` and `answer_labels` to `Field` types.
metadata = {
    'original_passage': passage_text,
    'token_offsets': passage_offsets,
    'sentence_starts': sentence_starts,
    'question_tokens': [token.text for token in question_tokens],
    'passage_tokens': [token.text for token in passage_tokens],
    'answer_texts': answer_texts,
    'answer_labels': answer_labels,
}
metadata.update(additional_metadata)
fields['metadata'] = MetadataField(metadata)
return Instance(fields)
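
The token_offsets kept in metadata are what let a predicted token span be mapped back to a character span of the original passage; a small sketch with made-up offsets (span_to_text is a hypothetical helper, not part of the reader).

# Hypothetical values mirroring the metadata keys above.
original_passage = "The quick brown fox jumps."
token_offsets = [(0, 3), (4, 9), (10, 15), (16, 19), (20, 26)]  # (start_char, end_char) per token

def span_to_text(start_token, end_token):
    start_char = token_offsets[start_token][0]
    end_char = token_offsets[end_token][1]
    return original_passage[start_char:end_char]

print(span_to_text(2, 3))  # "brown fox"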
if not expression:
    expression.append(3 * [-1])
expression_indices.append(ArrayField(np.array(expression), padding_value=-1))
if not expression_indices:
    expression_indices = [ArrayField(np.array([3 * [-1]]), padding_value=-1)
                          for _ in range(len(self.templates))]
fields["answer_as_expressions"] = ListField(expression_indices)

count_fields: List[Field] = [LabelField(count_label, skip_indexing=True)
                             for count_label in valid_counts]
if not count_fields:
    count_fields.append(LabelField(-1, skip_indexing=True))
fields["answer_as_counts"] = ListField(count_fields)

fields["num_spans"] = LabelField(num_spans, skip_indexing=True)
fields["metadata"] = MetadataField(metadata)
return Instance(fields)
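
When a question has no gold count answer, the reader still appends a single -1 placeholder so the ListField is never empty; the same pattern in isolation, with a made-up (empty) valid_counts list.

from allennlp.data.fields import LabelField, ListField

valid_counts = []  # no count answer for this question
count_fields = [LabelField(c, skip_indexing=True) for c in valid_counts]
if not count_fields:
    # -1 marks "no gold count"; skip_indexing keeps the raw integer instead of a vocab id.
    count_fields.append(LabelField(-1, skip_indexing=True))
answer_as_counts = ListField(count_fields)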
def text_to_instance(self,  # type: ignore
                     tokens,
                     pos_tags=None,
                     chunk_tags=None,
                     ner_tags=None):
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    # Recode the labels if necessary.
    if self.coding_scheme == "BIOUL":
        coded_chunks = to_bioul(chunk_tags) if chunk_tags is not None else None
        coded_ner = to_bioul(ner_tags) if ner_tags is not None else None
    else:
        # the default IOB1
        coded_chunks = chunk_tags
        coded_ner = ner_tags
    # Add "feature labels" to the instance.
    if 'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError("Dataset reader was specified to use pos_tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
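
The to_bioul call above rewrites IOB1 tags into BIOUL spans; a small check of what that recoding produces, assuming allennlp's to_bioul with IOB1 input (the tag sequence is made up).

from allennlp.data.dataset_readers.dataset_utils import to_bioul

iob1_tags = ["I-ORG", "I-ORG", "O", "I-PER"]
print(to_bioul(iob1_tags, encoding="IOB1"))  # expected: ['B-ORG', 'L-ORG', 'O', 'U-PER']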
)
if not add_sub_signs_field:
    add_sub_signs_field.append(
        SequenceLabelField([0] * len(number_tokens), numbers_in_passage_field)
    )
fields["answer_as_add_sub_expressions"] = ListField(add_sub_signs_field)

count_fields: List[Field] = [
    LabelField(count_label, skip_indexing=True) for count_label in answer_info["counts"]
]
if not count_fields:
    count_fields.append(LabelField(-1, skip_indexing=True))
fields["answer_as_counts"] = ListField(count_fields)

metadata.update(additional_metadata)
fields["metadata"] = MetadataField(metadata)
return Instance(fields)
intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                        "dialog_act": dialog_act if dialog_act is not None else {}})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    return Instance(fields)
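
A sketch of the fields this reader ends up with for one dialogue turn, built standalone with made-up tokens, tags, intents, and dialog_act annotations (the indexer choice is an assumption).

from allennlp.data import Instance, Token
from allennlp.data.fields import TextField, SequenceLabelField, MultiLabelField, MetadataField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
tokens = [Token(t) for t in ["I", "need", "a", "cheap", "hotel"]]
tokens_field = TextField(tokens, indexers)
fields = {
    "tokens": tokens_field,
    "tags": SequenceLabelField(["O", "O", "O", "B-price", "O"], tokens_field),
    "intents": MultiLabelField(["Inform"], label_namespace="intent_labels"),
    "metadata": MetadataField({"words": [t.text for t in tokens],
                               "dialog_act": {"Inform": [["price", "cheap"]]}}),
}
instance = Instance(fields)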
action_sequence_fields: List[Field] = []
for logical_form in logical_forms:
    expression = world.parse_logical_form(logical_form)
    action_sequence = world.get_action_sequence(expression)
    try:
        index_fields: List[Field] = []
        for production_rule in action_sequence:
            index_fields.append(IndexField(action_map[production_rule], action_field))
        action_sequence_fields.append(ListField(index_fields))
    except KeyError as error:
        logger.info(f"Missing production rule: {error.args}, skipping logical form")
        logger.info(f"Question was: {question}")
        logger.info(f"Logical form was: {logical_form}")
        continue
fields["target_action_sequences"] = ListField(action_sequence_fields)
fields["metadata"] = MetadataField(additional_metadata or {})
return Instance(fields)
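
The KeyError handled above comes from action_map, which maps each production-rule string to its index in action_field; a minimal sketch with made-up rules showing how an unseen rule gets a logical form skipped.

# Made-up production rules; in the reader, `action_map` indexes into `action_field`.
valid_actions = ["@start@ -> r", "r -> [<e,r>, e]", "e -> fb:cell.paris"]
action_map = {rule: i for i, rule in enumerate(valid_actions)}

action_sequence = ["@start@ -> r", "r -> [<e,r>, e]", "e -> fb:cell.london"]
try:
    indices = [action_map[rule] for rule in action_sequence]
except KeyError as error:
    # "e -> fb:cell.london" is not among the valid actions, so this sequence is skipped.
    print(f"Missing production rule: {error.args}, skipping logical form")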