# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# account for special tokens (CLS, SEP, SEP..) in lm_label_ids
lm_label_ids = insert_at_special_tokens_pos(lm_label_ids, special_tokens_mask, insert_element=-1)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)
feature_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
"lm_label_ids": lm_label_ids,
}
if next_sent_pred:
# This mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
if label_ids:
label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)
feature_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
"initial_mask": initial_mask,
}
if label_ids:
feature_dict[label_tensor_name] = label_ids
return [feature_dict]
label_ids = None
logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
"\nIf your are running in *inference* mode: Don't worry!"
"\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
# This mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
if label_ids:
label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)
feature_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
"initial_mask": initial_mask,
}
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
assert len(input_ids) == max_seq_len
assert len(padding_mask) == max_seq_len
assert len(segment_ids) == max_seq_len
feat_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
}
# Add Labels for different tasks
for task_name, task in tasks.items():
try:
label_name = task["label_name"]
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
if label_ids:
label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)
feature_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
"initial_mask": initial_mask,
}
if label_ids:
feature_dict[label_tensor_name] = label_ids
return [feature_dict]
input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
assert len(input_ids) == max_seq_len
assert len(padding_mask) == max_seq_len
assert len(segment_ids) == max_seq_len
feat_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
}
# Add Labels for different tasks
# Zero-pad up to the sequence length.
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)
feature_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
"lm_label_ids": lm_label_ids,
}
if next_sent_pred:
feature_dict["nextsentence_label_ids"] = is_next_label_id
assert len(input_ids) == max_seq_len
assert len(padding_mask) == max_seq_len
assert len(segment_ids) == max_seq_len
assert len(lm_label_ids) == max_seq_len
"\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
# This mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
if label_ids:
label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)
feature_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
"initial_mask": initial_mask,
}
if label_ids:
feature_dict[label_tensor_name] = label_ids
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)
feature_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
"lm_label_ids": lm_label_ids,
}
if next_sent_pred:
feature_dict["nextsentence_label_ids"] = is_next_label_id
assert len(input_ids) == max_seq_len
assert len(padding_mask) == max_seq_len
assert len(segment_ids) == max_seq_len
truncation_strategy='do_not_truncate' # We've already truncated our tokens before
)
input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# Padding up to the sequence length.
# Normal case: adding multiple 0 to the right
# Special cases:
# a) xlnet pads on the left and uses "4" for padding token_type_ids
if tokenizer.__class__.__name__ == "XLNetTokenizer":
pad_on_left = True
segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
else:
pad_on_left = False
segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)
input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
assert len(input_ids) == max_seq_len
assert len(padding_mask) == max_seq_len
assert len(segment_ids) == max_seq_len
feat_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,