def text_to_instance(self, nc: str) -> Instance:
    # Build one field for the full compound plus, for binary NCs, fields for each
    # constituent and for the space-separated word sequence.
    tokenized_nc = self._tokenizer.tokenize(nc)
    nc_field = TextField(tokenized_nc, self._token_indexers)
    # Default: reuse the full-compound field if the NC does not split into two words
    w1_field, w2_field, nc_seq_field = nc_field, nc_field, nc_field
    constituents = nc.split('_')
    if len(constituents) == 2:
        w1, w2 = constituents
        tokenized_w1 = self._tokenizer.tokenize(w1)
        w1_field = TextField(tokenized_w1, self._token_indexers)
        tokenized_w2 = self._tokenizer.tokenize(w2)
        w2_field = TextField(tokenized_w2, self._token_indexers)
        tokenized_nc_seq = self._tokenizer.tokenize(' '.join((w1, w2)))
        nc_seq_field = TextField(tokenized_nc_seq, self._token_indexers)
    fields = {'nc': nc_field, 'w1': w1_field, 'w2': w2_field, 'nc_seq': nc_seq_field}
    return Instance(fields)
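
The reader above emits four text fields for an underscore-joined noun compound: the full compound, each constituent, and the space-separated word sequence. The sketch below builds the same field layout by hand for one binary compound; the example compound, the single-token fields, and the SingleIdTokenIndexer setup are illustrative assumptions, not part of the original reader.

# Hedged sketch (illustrative data, assumes an AllenNLP install with 0.x-2.x class paths)
from allennlp.data import Instance, Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

token_indexers = {'tokens': SingleIdTokenIndexer()}
nc = 'olive_oil'

nc_field = TextField([Token(nc)], token_indexers)                  # the compound as a single token
w1, w2 = nc.split('_')                                             # binary compound -> two constituents
w1_field = TextField([Token(w1)], token_indexers)
w2_field = TextField([Token(w2)], token_indexers)
nc_seq_field = TextField([Token(w1), Token(w2)], token_indexers)   # 'olive oil' as a two-token sequence

print(Instance({'nc': nc_field, 'w1': w1_field, 'w2': w2_field, 'nc_seq': nc_seq_field}))
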
def _make_instance_from_text(self, sent_tokens, pred_index, annotations=None, sent_id=None):
    instance_dict = {}
    if isinstance(sent_tokens, str):
        sent_tokens = sent_tokens.split()
    sent_tokens = cleanse_sentence_text(sent_tokens)
    text_field = TextField([Token(t) for t in sent_tokens], self._token_indexers)
    instance_dict['text'] = text_field
    # Binary indicator marking the predicate position in the sentence
    instance_dict['predicate_indicator'] = SequenceLabelField(
        [1 if i == pred_index else 0 for i in range(len(sent_tokens))], text_field)
    if annotations is not None:
        for i, slot_name in enumerate(self._slot_labels):
            span_slot = ListField([LabelField(ann.slots[i], label_namespace="slot_%s" % slot_name)
                                   for ann in annotations for span in ann.all_spans])
            instance_dict['span_slot_%s' % slot_name] = span_slot
        labeled_span_field = ListField([SpanField(span.start(), span.end(), text_field)
                                        for ann in annotations for span in ann.all_spans])
        instance_dict['labeled_spans'] = labeled_span_field
        if self._bio_labels:
            bio_labels = ["O"] * len(sent_tokens)
            bio_labels[pred_index] = "B-V"

def text_to_instance(
    self,  # type: ignore
    tokens: List[Token],
    ner_tags: List[str] = None,
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {"tokens": sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    # Add "tag label" to instance
    if ner_tags is not None:
        if self._coding_scheme == "BIOUL":
            ner_tags = to_bioul(ner_tags, encoding="BIO")
        instance_fields["tags"] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
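
For reference, here is a hedged, self-contained sketch of the same tokens/tags pairing the NER method above produces; the sentence, the BIO tags, and the SingleIdTokenIndexer are made up for illustration.

# Hedged sketch (illustrative data, not from the original reader)
from allennlp.data import Instance, Token
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = [Token(w) for w in ["John", "lives", "in", "New", "York"]]
ner_tags = ["B-PER", "O", "O", "B-LOC", "I-LOC"]            # BIO-encoded tags, one per token

sequence = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
fields = {
    "tokens": sequence,
    "metadata": MetadataField({"words": [t.text for t in tokens]}),
    "tags": SequenceLabelField(ner_tags, sequence),          # labels aligned to the token sequence
}
print(Instance(fields))
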
def text_to_instance(self, nc: str, paraphrase: str = None, neg_paraphrase: str = None) -> Instance:
    tokenized_nc = self._tokenizer.tokenize(nc)
    nc_field = TextField(tokenized_nc, self._token_indexers)
    # Remove non-binary NCs to make it comparable to the other composition functions
    if nc_field.sequence_length() != 2:
        return None
    fields = {'nc': nc_field}
    # During training, we minimize the distance to the paraphrase
    if paraphrase is not None:
        tokenized_paraphrase = self._tokenizer.tokenize(paraphrase)
        paraphrase_field = TextField(tokenized_paraphrase, self._token_indexers)
        fields['paraphrase'] = paraphrase_field
        # Negatively sampled paraphrase to move away from
        tokenized_neg_paraphrase = self._tokenizer.tokenize(neg_paraphrase)
        neg_paraphrase_field = TextField(tokenized_neg_paraphrase, self._token_indexers)
        fields['neg_paraphrase'] = neg_paraphrase_field
    return Instance(fields)
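
Below is a hedged sketch of the field layout the paraphrase reader above produces when both a paraphrase and a negatively sampled paraphrase are supplied; the compound, the paraphrases, and the whitespace tokenization stand-in are illustrative, not taken from the original data.

# Hedged sketch (illustrative data; 'tokenize' is a stand-in for self._tokenizer)
from allennlp.data import Instance, Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {'tokens': SingleIdTokenIndexer()}
tokenize = lambda text: [Token(w) for w in text.split()]

nc_field = TextField(tokenize('olive oil'), indexers)        # two tokens -> binary compound, kept
fields = {
    'nc': nc_field,
    'paraphrase': TextField(tokenize('oil made from olives'), indexers),
    'neg_paraphrase': TextField(tokenize('a tax paid on income'), indexers),
}
print(Instance(fields))
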
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     pos_tags: List[str] = None,
                     gold_actions: List[List[str]] = None,
                     id: str = None,
                     amr: str = None,
                     input: str = None,
                     mrp: str = None,
                     companion: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}
    if id:
        meta_dict["id"] = id
    if amr:
        meta_dict["amr"] = amr
    if input:
        meta_dict["input"] = input
    if mrp:
        meta_dict["mrp"] = json.loads(mrp)
    if companion:
        meta_dict["companion"] = json.loads(companion)
    if lemmas is not None and self._lemma_indexers is not None:
        fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)
    if pos_tags is not None:

def text_to_instance(
    self,  # type: ignore
    candidates: List[str],
    query: str,
    supports: List[str],
    _id: str = None,
    answer: str = None,
    annotations: List[List[str]] = None,
) -> Instance:
    fields: Dict[str, Field] = {}
    candidates_field = ListField(
        [
            TextField(candidate, self._token_indexers)
            for candidate in self._tokenizer.batch_tokenize(candidates)
        ]
    )
    fields["query"] = TextField(self._tokenizer.tokenize(query), self._token_indexers)
    fields["supports"] = ListField(
        [
            TextField(support, self._token_indexers)
            for support in self._tokenizer.batch_tokenize(supports)
        ]
    )
    fields["answer"] = TextField(self._tokenizer.tokenize(answer), self._token_indexers)
    fields["answer_index"] = IndexField(candidates.index(answer), candidates_field)
def batch_to_ids(batch):
    """
    Given a batch (as a list of tokenized sentences), return a batch
    of padded character ids.
    """
    indexer = ELMoTokenCharactersIndexer()  # character-level indexer used by ELMo
    instances = []
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)
    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
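
A usage sketch for the helper above, assuming its AllenNLP (0.x-style) imports are in scope: each element of the batch is an already-tokenized sentence, and the result is a padded character-id tensor with ELMo's 50 character ids per token.

# Usage sketch (assumes batch_to_ids and its imports from above are in scope)
character_ids = batch_to_ids([['First', 'sentence', '.'], ['Another', '.']])
print(character_ids.shape)  # expected torch.Size([2, 3, 50]): (batch, max sentence length, chars per token)
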
def text_to_instance(self, nc: str) -> Instance:
    nc = nc.replace('_', ' ')  # underscore-joined compound -> whitespace-separated words
    tokenized_nc = self._tokenizer.tokenize(nc)
    nc_field = TextField(tokenized_nc, self._token_indexers)
    fields = {'nc': nc_field}
    return Instance(fields)
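
Unlike the first noun-compound reader above, this one rewrites underscores to spaces, so the compound ends up as a multi-token field. A minimal hedged sketch, with the compound and the token indexer as assumptions:

# Hedged sketch (illustrative data)
from allennlp.data import Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

nc = 'olive_oil'.replace('_', ' ')                                      # -> 'olive oil'
nc_field = TextField([Token(w) for w in nc.split()], {'tokens': SingleIdTokenIndexer()})
print(nc_field.sequence_length())                                       # 2
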
def tokens_to_lm_instance(tokens: List[Token],
                          token_indexers: Dict[str, TokenIndexer]):
    tokens = list(tokens)  # shallow copy
    tokens.insert(0, Token(START_SYMBOL))
    tokens.append(Token(END_SYMBOL))
    # Input drops the final token and output drops the first,
    # so each input position predicts the next token.
    input_field = TextField(tokens[:-1], token_indexers)
    output_field = TextField(tokens[1:], token_indexers)
    return Instance({'input_tokens': input_field,
                     'output_tokens': output_field})
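
A hedged usage sketch for tokens_to_lm_instance (illustrative sentence; assumes the function and its START_SYMBOL/END_SYMBOL imports are in scope): the printed lists show the one-token shift between input and output, using AllenNLP's default boundary symbols.

# Usage sketch (illustrative data)
from allennlp.data import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = [Token(w) for w in ['the', 'cat', 'sat']]
instance = tokens_to_lm_instance(tokens, {'tokens': SingleIdTokenIndexer()})
print([t.text for t in instance.fields['input_tokens'].tokens])    # ['@start@', 'the', 'cat', 'sat']
print([t.text for t in instance.fields['output_tokens'].tokens])   # ['the', 'cat', 'sat', '@end@']
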
instance_strings = text_file.readlines()
if self._tokens_per_instance is not None:
    all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
    tokenized_text = self._tokenizer.tokenize(all_text)
    num_tokens = self._tokens_per_instance + 1
    tokenized_strings = []
    logger.info("Creating dataset from all text in file: %s", file_path)
    for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
        tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
else:
    tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]
for tokenized_string in tokenized_strings:
    input_field = TextField(tokenized_string[:-1], self._token_indexers)
    output_field = TextField(tokenized_string[1:], self._output_indexer)
    yield Instance({'input_tokens': input_field,
                    'output_tokens': output_field})
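
The loop above steps through the token stream in overlapping windows of tokens_per_instance + 1 tokens, so consecutive chunks share exactly one boundary token and the shifted input/output fields line up across chunks. A standalone sketch of that windowing arithmetic with made-up numbers:

# Standalone sketch of the windowing above (illustrative numbers, no AllenNLP needed):
# each chunk holds num_tokens items and consecutive chunks overlap by exactly one token.
tokenized_text = list(range(10))        # stand-in for a tokenized file
tokens_per_instance = 4
num_tokens = tokens_per_instance + 1    # one extra token so input/output can be shifted by one

chunks = [tokenized_text[i:i + num_tokens]
          for i in range(0, len(tokenized_text) - num_tokens, num_tokens - 1)]
print(chunks)   # [[0, 1, 2, 3, 4], [4, 5, 6, 7, 8]]
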