# Evaluate on test set using model with best validation score
info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda, logger=logger)

if args.split_test_by_overlap_dataset is not None:
    # Report test performance separately for molecules that do and do not
    # appear in the overlap dataset
    overlap_data = get_data(path=args.split_test_by_overlap_dataset, logger=logger)
    overlap_smiles = set(overlap_data.smiles())
    test_data_intersect, test_data_nonintersect = [], []
    for d in test_data.data:
        if d.smiles in overlap_smiles:
            test_data_intersect.append(d)
        else:
            test_data_nonintersect.append(d)
    test_data_intersect, test_data_nonintersect = MoleculeDataset(test_data_intersect), MoleculeDataset(test_data_nonintersect)

    for name, td in [('Intersect', test_data_intersect), ('Nonintersect', test_data_nonintersect)]:
        test_preds = predict(
            model=model,
            data=td,
            args=args,
            scaler=scaler,
            logger=logger
        )
        test_scores = evaluate_predictions(
            preds=test_preds,
            targets=td.targets(),
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            args=args,
            logger=logger
        )
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} for {name} = {avg_test_score:.6f}')
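# Illustrative only (not part of the original source): np.nanmean above averages
# the per-task scores while ignoring NaN entries, e.g. tasks with no labelled
# molecules in a given split. The toy scores below are made up.
example_task_scores = [0.91, float('nan'), 0.85]  # hypothetical per-task metrics
example_avg = np.nanmean(example_task_scores)     # 0.88; the NaN task is skipped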
def evaluate(model: nn.Module,
             data: MoleculeDataset,
             metric_func: Callable,
             dataset_type: str,
             args: Namespace,
             scaler: StandardScaler = None,
             logger: logging.Logger = None) -> List[float]:
    """
    Evaluates an ensemble of models on a dataset.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param metric_func: Metric function which takes in a list of targets and a list of predictions.
    :param dataset_type: Dataset type.
    :param args: Arguments.
    :param scaler: A StandardScaler object fit on the training targets.
    :param logger: Logger.
    :return: A list with the score for each task based on `metric_func`.
    """
    preds = predict(
        model=model,
        data=data,
        args=args,
        scaler=scaler,
        bert_save_memory=True,
        logger=logger
    )

    if args.maml:
        preds, targets = preds  # in this case the targets are determined by the tasks sampled during prediction
    else:
        targets = data.targets()

        if args.dataset_type == 'bert_pretraining':
            # Only predict targets that are masked out
            targets['vocab'] = [target if mask == 0 else None for target, mask in zip(targets['vocab'], data.mask())]
    results = evaluate_predictions(
        preds=preds,
        targets=targets,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        args=args,
        logger=logger
    )

    return results
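# Illustrative only (not part of the original source): in the bert_pretraining
# branch above, a vocab target is kept only where its mask is 0 (the position
# was masked out during pretraining); all other targets become None and are
# therefore excluded from scoring. Toy values below are made up.
toy_targets = [3, 7, 2, 9]
toy_mask = [0, 1, 0, 1]
toy_kept = [t if m == 0 else None for t, m in zip(toy_targets, toy_mask)]
# toy_kept == [3, None, 2, None]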
if len(test_data) == 0:  # just get some garbage results without crashing; in this case we didn't care anyway
    test_preds, test_scores = sum_test_preds, [0 for _ in range(len(args.task_names))]
else:
    test_preds = predict(
        model=model,
        data=test_data,
        args=args,
        scaler=scaler,
        logger=logger
    )
    test_scores = evaluate_predictions(
        preds=test_preds,
        targets=test_targets,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        args=args,
        logger=logger
    )
if args.compound_names:
    compound_names = test_data.compound_names()

print(f'Test size = {len(test_data):,}')

# Normalize features
if train_args.features_scaling:
    test_data.normalize_features(features_scaler)

# Predict with each model individually and sum predictions
sum_preds = np.zeros((len(test_data), args.num_tasks))
print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
    # Load model
    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(
        model=model,
        data=test_data,
        args=args,
        scaler=scaler
    )
    sum_preds += np.array(model_preds)

# Ensemble predictions
avg_preds = sum_preds / args.ensemble_size
avg_preds = avg_preds.tolist()
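# Illustrative only (not part of the original source): the ensemble prediction
# above is a plain mean of the per-model prediction matrices, which assumes
# args.ensemble_size equals the number of checkpoints summed in the loop.
# Toy numbers below are made up; shape is (num_molecules, num_tasks).
toy_model_preds = [
    np.array([[0.2, 0.8], [0.6, 0.4]]),  # hypothetical model 1
    np.array([[0.4, 0.6], [0.8, 0.2]]),  # hypothetical model 2
]
toy_avg = sum(toy_model_preds) / len(toy_model_preds)
# toy_avg == [[0.3, 0.7], [0.7, 0.3]]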
# Save predictions
assert len(test_data) == len(avg_preds)
print(f'Saving predictions to {args.preds_path}')
# Put Nones for invalid smiles
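# Sketch only, with assumed names (not from the original source): `toy_num_inputs`
# and `toy_valid_indices` stand in for bookkeeping that records which input rows
# parsed to valid molecules. The idea is to expand the ensemble predictions back
# to the original input order, leaving None wherever a SMILES string was invalid.
toy_num_inputs = 4                       # rows in the original input file
toy_valid_indices = [0, 2, 3]            # inputs that parsed to valid molecules
toy_avg_preds = [[0.1], [0.5], [0.9]]    # one prediction row per valid molecule

toy_full_preds = [None] * toy_num_inputs
for valid_pos, original_pos in enumerate(toy_valid_indices):
    toy_full_preds[original_pos] = toy_avg_preds[valid_pos]
# toy_full_preds == [[0.1], None, [0.5], [0.9]]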