            to_predict: A Python list of text (str) to be sent to the model for prediction.

        Returns:
            preds: A Python list of lists of dicts, mapping each word in each text to its NER tag.
            model_outputs: A Python list of the raw model outputs for each text.
        """

        tokenizer = self.tokenizer
        device = self.device
        model = self.model
        args = self.args
        pad_token_label_id = self.pad_token_label_id

        self._move_model_to_device()

        predict_examples = [
            InputExample(i, sentence.split(), ["O" for word in sentence.split()])
            for i, sentence in enumerate(to_predict)
        ]
        eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"])

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None

        model.eval()

        for batch in tqdm(eval_dataloader, disable=args['silent']):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
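
The fragment above is the opening of a predict() method on an NER model wrapper. A minimal usage sketch, assuming a simpletransformers-style NERModel class (the import path, class name, and constructor arguments here are assumptions, not shown in the snippet itself):

# Hedged usage sketch: NERModel and its constructor arguments are assumed,
# based on the docstring above, not confirmed by the snippet.
from simpletransformers.ner import NERModel

model = NERModel("bert", "bert-base-cased")

# predict() takes a list of raw sentences; per the docstring it returns
# per-word tag dicts plus the raw model outputs for each text.
predictions, raw_outputs = model.predict(["Hermione went to Hogwarts"])
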
def read_examples_from_file(data_file, mode):
    file_path = data_file
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words = []
        labels = []
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
                                                 words=words,
                                                 labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
            else:
                splits = line.split(" ")
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1].replace("\n", ""))
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
            examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
                                         words=words,
                                         labels=labels))
    return examples
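
read_examples_from_file parses a CoNLL-style file: one space-separated token per line with the word in the first column and the label in the last, and sentences separated by blank lines or -DOCSTART- markers. A small illustration, using a hypothetical file path:

# data/train.txt (hypothetical path) in the space-separated format the
# reader parses; blank lines separate sentences:
#
#   EU B-ORG
#   rejects O
#   German B-MISC
#   call O
#
#   Peter B-PER
#   Blackburn I-PER
train_examples = read_examples_from_file("data/train.txt", mode="train")
# Each InputExample carries a guid such as "train-1", the sentence's
# words, and the parallel list of labels (defaulting to "O" when the
# file has no label column, e.g. for mode = "test").
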
                example_indices = batch[3]

                if args['model_type'] in ['xlnet', 'xlm']:
                    inputs.update({'cls_index': batch[4],
                                   'p_mask': batch[5]})

                outputs = model(**inputs)

                for i, example_index in enumerate(example_indices):
                    eval_feature = features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)
                    if args['model_type'] in ['xlnet', 'xlm']:
                        # XLNet uses a more complex post-processing procedure
                        result = RawResultExtended(unique_id=unique_id,
                                                   start_top_log_probs=to_list(outputs[0][i]),
                                                   start_top_index=to_list(outputs[1][i]),
                                                   end_top_log_probs=to_list(outputs[2][i]),
                                                   end_top_index=to_list(outputs[3][i]),
                                                   cls_logits=to_list(outputs[4][i]))
                    else:
                        result = RawResult(unique_id=unique_id,
                                           start_logits=to_list(outputs[0][i]),
                                           end_logits=to_list(outputs[1][i]))
                    all_results.append(result)

        prefix = 'test'
        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)

        output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
        output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))

        if args['model_type'] in ['xlnet', 'xlm']:
            # XLNet uses a more complex post-processing procedure
            all_predictions, all_nbest_json, scores_diff_json = write_predictions_extended(
                examples, features, all_results, args['n_best_size'],
                args['max_answer_length'], output_prediction_file,
                output_nbest_file, output_null_log_odds_file, eval_data,
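
Both of these question-answering snippets call a to_list helper to turn output tensors into plain Python lists. If it is not already in scope, the conventional one-liner from the Hugging Face SQuAD example code is:

def to_list(tensor):
    # Detach from the autograd graph, move to CPU, and convert to a
    # (nested) plain Python list.
    return tensor.detach().cpu().tolist()
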
                if args['model_type'] in ['xlnet', 'xlm']:
                    inputs.update({'cls_index': batch[4],
                                   'p_mask': batch[5]})

                outputs = model(**inputs)

                for i, example_index in enumerate(example_indices):
                    eval_feature = features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)
                    if args['model_type'] in ['xlnet', 'xlm']:
                        # XLNet uses a more complex post-processing procedure
                        result = RawResultExtended(unique_id=unique_id,
                                                   start_top_log_probs=to_list(outputs[0][i]),
                                                   start_top_index=to_list(outputs[1][i]),
                                                   end_top_log_probs=to_list(outputs[2][i]),
                                                   end_top_index=to_list(outputs[3][i]),
                                                   cls_logits=to_list(outputs[4][i]))
                    else:
                        result = RawResult(unique_id=unique_id,
                                           start_logits=to_list(outputs[0][i]),
                                           end_logits=to_list(outputs[1][i]))
                    all_results.append(result)

        if args['model_type'] in ['xlnet', 'xlm']:
            answers = get_best_predictions_extended(examples, features, all_results, n_best_size,
                                                    args['max_answer_length'], model.config.start_n_top,
                                                    model.config.end_n_top, True, tokenizer,
                                                    args['null_score_diff_threshold'])
        else:
            answers = get_best_predictions(examples, features, all_results, n_best_size,
                                           args['max_answer_length'], False, False, True, False)

        return answers
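
RawResult and RawResultExtended are simple containers whose fields mirror the keyword arguments used in the loops above. If they are not imported from the SQuAD utilities, equivalent namedtuple definitions would be:

import collections

# Field names are taken directly from the keyword arguments used when
# constructing results in the evaluation loops above.
RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

RawResultExtended = collections.namedtuple(
    "RawResultExtended",
    ["unique_id", "start_top_log_probs", "start_top_index",
     "end_top_log_probs", "end_top_index", "cls_logits"])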