if cluster_dict:
    for cluster, entity_id in cluster_dict.items():
        # Fill in "1" for positions corresponding to words in entities
        # Need offset by 1 to account for @@START@@ token.
        entity_types[cluster[0] + 1:cluster[1] + 1 + 1] = 1
        # Fill in entity ID
        entity_ids[cluster[0] + 1:cluster[1] + 1 + 1] = entity_id
        entity_length = (cluster[1] + 1) - cluster[0]
        # Fill in mention length
        mention_lengths[cluster[0] + 1:cluster[1] + 1 + 1] = np.arange(entity_length, 0, step=-1)
fields['entity_ids'] = SequentialArrayField(entity_ids, dtype=np.int64)
fields['mention_lengths'] = SequentialArrayField(mention_lengths, dtype=np.int64)
fields['entity_types'] = SequentialArrayField(entity_types, dtype=np.uint8)
return Instance(fields)
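# To make the offset arithmetic above concrete, here is a small self-contained
# sketch. The span convention (0-indexed, inclusive) and the zero-initialization
# of the arrays are assumptions; neither is shown in the snippet.
import numpy as np

entity_types = np.zeros(8, dtype=np.uint8)      # assumed initialization
entity_ids = np.zeros(8, dtype=np.int64)        # assumed initialization
mention_lengths = np.zeros(8, dtype=np.int64)   # assumed initialization

# A single cluster (2, 4) labelled with entity ID 7, in a sequence of length 8
# where position 0 holds the @@START@@ token.
cluster, entity_id = (2, 4), 7
entity_types[cluster[0] + 1:cluster[1] + 1 + 1] = 1
entity_ids[cluster[0] + 1:cluster[1] + 1 + 1] = entity_id
entity_length = (cluster[1] + 1) - cluster[0]
mention_lengths[cluster[0] + 1:cluster[1] + 1 + 1] = np.arange(entity_length, 0, step=-1)

print(entity_types)     # [0 0 0 1 1 1 0 0]
print(entity_ids)       # [0 0 0 7 7 7 0 0]
print(mention_lengths)  # [0 0 0 3 2 1 0 0]  (counts down to the end of the mention)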
# TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
# now, but may change later too.
agenda = worlds[0].get_agenda_for_sentence(sentence)
assert agenda, "No agenda found for sentence: %s" % sentence
# agenda_field contains indices into actions.
agenda_field = ListField(
    [IndexField(instance_action_ids[action], action_field) for action in agenda]
)
fields["agenda"] = agenda_field
if labels:
    labels_field = ListField(
        [LabelField(label, label_namespace="denotations") for label in labels]
    )
    fields["labels"] = labels_field
return Instance(fields)
if self._tokens_per_instance is not None:
    all_text = u" ".join([x.replace(u"\n", u" ").strip() for x in instance_strings])
    tokenized_text = self._tokenizer.tokenize(all_text)
    num_tokens = self._tokens_per_instance + 1
    tokenized_strings = []
    logger.info(u"Creating dataset from all text in file: %s", file_path)
    for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
        tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
else:
    tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

for tokenized_string in tokenized_strings:
    input_field = TextField(tokenized_string[:-1], self._token_indexers)
    output_field = TextField(tokenized_string[1:], self._output_indexer)
    yield Instance({u'input_tokens': input_field,
                    u'output_tokens': output_field})
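# The stride of (num_tokens - 1) above makes consecutive chunks overlap by exactly
# one token, so the last prediction target of one instance is also the first input
# token of the next. Toy sketch of just that windowing (plain characters stand in
# for tokens; tokens_per_instance = 3 is an arbitrary choice):
tokens = list("abcdefghijk")              # 11 "tokens"
tokens_per_instance = 3
num_tokens = tokens_per_instance + 1
windows = [tokens[i:i + num_tokens]
           for i in range(0, len(tokens) - num_tokens, num_tokens - 1)]
# windows == [['a','b','c','d'], ['d','e','f','g'], ['g','h','i','j']]
# Each window w becomes input w[:-1] and output w[1:]; the trailing remainder
# ('k' here) is dropped, matching the range() bound above.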
s_contexts = sample_n(s_ent['other_contexts'], 20)
t_contexts = sample_n(t_ent['other_contexts'], 20)
fields['s_ent_context'] = ListField(
    [TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
     for c in s_contexts]
) if s_contexts else self._empty_list_token_text_field
fields['t_ent_context'] = ListField(
    [TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
     for c in t_contexts]
) if t_contexts else self._empty_list_token_text_field
# add boolean label (0 = no match, 1 = match)
fields['label'] = BooleanField(label)
return Instance(fields)
def text_to_instance(self,  # type: ignore
                     input_text: str,
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    if self._field_preparator:
        input_text = self._field_preparator.transform(self._input, input_text)
    input_tokens = self._tokenizer.tokenize(input_text)
    fields['tokens'] = TextField(input_tokens, self._token_indexers)
    if label:
        if self._field_preparator:
            label = self._field_preparator.transform(self._gold_label, label)
        fields['label'] = LabelField(label)
    return Instance(fields)
def tokens_to_lm_instance(tokens: List[Token],
                          token_indexers: Dict[str, TokenIndexer]) -> Instance:
    tokens = list(tokens)  # shallow copy
    tokens.insert(0, Token(START_SYMBOL))
    tokens.append(Token(END_SYMBOL))
    input_field = TextField(tokens[:-1], token_indexers)
    output_field = TextField(tokens[1:], token_indexers)
    return Instance({'input_tokens': input_field,
                     'output_tokens': output_field})
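# Hedged usage sketch for tokens_to_lm_instance, assuming an AllenNLP-style
# environment in which the imports below resolve; the sentence and the "tokens"
# indexer name are made up for illustration.
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

token_indexers = {"tokens": SingleIdTokenIndexer()}
tokens = [Token(word) for word in "the cat sat".split()]
instance = tokens_to_lm_instance(tokens, token_indexers)
# instance["input_tokens"]:  <start> the cat sat
# instance["output_tokens"]: the cat sat <end>
# i.e. the output field is the input field shifted left by one position.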
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    # Shuffle the documents if requested.
    ace_instances = self._shuffle_documents(get_dataset_instances(instances, "ace"))
    ontonotes_instances = self._shuffle_documents(get_dataset_instances(instances, "ontonotes"))
    n_ontonotes = math.floor(len(ace_instances) / 2)
    ontonotes_instances = ontonotes_instances[:n_ontonotes]
    all_instances = self._shuffle_documents(ace_instances + ontonotes_instances)

    hoppers: Dict[Any, List[Instance]] = defaultdict(list)
    for instance in all_instances:
        # Which hopper do we put this instance in?
        instance_type = (instance["metadata"]["dataset"]
                         if "dataset" in instance["metadata"]
                         else None)
        hoppers[instance_type].append(instance)
        # If the hopper is full, yield up the batch and clear it.
        if len(hoppers[instance_type]) >= self._batch_size:
            yield Batch(hoppers[instance_type])
            hoppers[instance_type].clear()

    # Deal with leftovers: any partially filled hopper becomes a final, smaller batch.
    for remaining in hoppers.values():
        if remaining:
            yield Batch(remaining)
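# The "hopper" pattern above, reduced to plain data so the batching behaviour is
# easy to see; the dataset names and batch_size below are made up.
from collections import defaultdict

batch_size = 2
items = [("ace", 1), ("ontonotes", 2), ("ace", 3), ("ace", 4), ("ontonotes", 5)]

hoppers = defaultdict(list)
batches = []
for dataset, item in items:
    hoppers[dataset].append(item)
    if len(hoppers[dataset]) >= batch_size:
        batches.append(list(hoppers[dataset]))
        hoppers[dataset].clear()
batches.extend(list(group) for group in hoppers.values() if group)
# batches == [[1, 3], [2, 5], [4]]
# Every batch is homogeneous in dataset, and leftovers form final, smaller batches.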
if self._granularity == u"3-class":
    if int(sentiment) < 2:
        sentiment = u"0"
    elif int(sentiment) == 2:
        sentiment = u"1"
    else:
        sentiment = u"2"
elif self._granularity == u"2-class":
    if int(sentiment) < 2:
        sentiment = u"0"
    elif int(sentiment) == 2:
        return None
    else:
        sentiment = u"1"
fields[u'label'] = LabelField(sentiment)
return Instance(fields)
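# The granularity handling above, written as a standalone mapping. It assumes the
# incoming sentiment is one of the strings "0".."4", with "2" the neutral class.
def collapse_sentiment(sentiment, granularity):
    value = int(sentiment)
    if granularity == "3-class":
        return "0" if value < 2 else ("1" if value == 2 else "2")
    if granularity == "2-class":
        if value == 2:
            return None      # neutral examples are dropped entirely
        return "0" if value < 2 else "1"
    return sentiment         # otherwise keep the fine-grained label

assert collapse_sentiment("4", "3-class") == "2"
assert collapse_sentiment("2", "2-class") is None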
return None
rnd = random()
# skip TN
if self._skip_correct and all(x == "CORRECT" for x in detect_tags):
    if rnd > self._tn_prob:
        return None
# skip TP
else:
    if rnd > self._tp_prob:
        return None
fields["labels"] = SequenceLabelField(labels, sequence,
                                      label_namespace="labels")
fields["d_tags"] = SequenceLabelField(detect_tags, sequence,
                                      label_namespace="d_tags")
return Instance(fields)
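# The rnd / _tn_prob / _tp_prob checks above subsample the training data: sentences
# whose detect tags are all CORRECT (true negatives) are kept with probability
# tn_prob, all other sentences with probability tp_prob. Standalone sketch of that
# filter; the probability values below are made up.
from random import random

def keep_sentence(detect_tags, tn_prob=0.1, tp_prob=1.0):
    all_correct = all(tag == "CORRECT" for tag in detect_tags)
    keep_prob = tn_prob if all_correct else tp_prob
    return random() <= keep_prob

keep_sentence(["CORRECT", "CORRECT"])  # kept on roughly 10% of calls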
s_contexts = sample_n(s_ent['other_contexts'], 16, 256)
t_contexts = sample_n(t_ent['other_contexts'], 16, 256)
fields['s_ent_context'] = ListField(
    [TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
     for c in s_contexts]
)
fields['t_ent_context'] = ListField(
    [TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
     for c in t_contexts]
)
# add boolean label (0 = no match, 1 = match)
fields['label'] = BooleanField(label)
return Instance(fields)
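# sample_n itself is not shown in either snippet. Judging only from the call sites,
# sample_n(contexts, 20) and sample_n(contexts, 16, 256), it plausibly draws at most
# n contexts at random, with an optional per-context length cap. This is a purely
# hypothetical reconstruction, NOT the project's actual implementation.
import random

def sample_n(contexts, n, max_len=None):
    sampled = random.sample(contexts, min(n, len(contexts)))
    if max_len is not None:
        sampled = [c[:max_len] for c in sampled]
    return sampled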