data = get_data(path=args.data_path, args=args, logger=logger)
args.num_tasks = data.num_tasks()
args.features_size = data.features_size()
args.real_num_tasks = args.num_tasks - args.features_size if args.predict_features else args.num_tasks
debug(f'Number of tasks = {args.num_tasks}')

if args.dataset_type == 'bert_pretraining':
    data.bert_init(args, logger)

# Split data
if args.dataset_type == 'regression_with_binning':  # Note: for now, binning based on whole dataset, not just training set
    data, bin_predictions, regression_data = data
    args.bin_predictions = bin_predictions
    debug(f'Splitting data with seed {args.seed}')
    train_data, _, _ = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
    _, val_data, test_data = split_data(regression_data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
else:
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_set:
        test_data = get_data(path=args.separate_test_set, args=args, features_path=args.separate_test_set_features, logger=logger)
        if args.separate_val_set:
            val_data = get_data(path=args.separate_val_set, args=args, features_path=args.separate_val_set_features, logger=logger)
            train_data = data  # nothing to split; we already got our test and val sets
        else:
            train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)

# Optionally replace test data with train or val data
if args.test_split == 'train':
    test_data = train_data
elif args.test_split == 'val':
    test_data = val_data
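
# For reference, a minimal sketch of how a purely random split might be
# implemented. This is an illustrative assumption, not the actual split_data,
# which also supports scaffold-based and other split types.
import random

def _random_split_sketch(data, sizes=(0.8, 0.1, 0.1), seed=0):
    """Shuffle indices with a fixed seed and partition by the given ratios."""
    assert abs(sum(sizes) - 1.0) < 1e-8
    indices = list(range(len(data)))
    random.Random(seed).shuffle(indices)
    train_end = int(sizes[0] * len(data))
    val_end = train_end + int(sizes[1] * len(data))
    train = [data[i] for i in indices[:train_end]]
    val = [data[i] for i in indices[train_end:val_end]]
    test = [data[i] for i in indices[val_end:]]
    return train, val, test
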
if args.dataset_type == 'classification':
    class_sizes = get_class_sizes(data)
    debug('Class sizes')
    for i, task_class_sizes in enumerate(class_sizes):
        debug(f'{args.task_names[i]} '
              f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

def run_random_forest(args: Namespace, logger: Logger = None) -> List[float]:
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    metric_func = get_metric_func(args.metric)

    debug('Loading data')
    data = get_data(path=args.data_path)

    debug(f'Splitting data with seed {args.seed}')
    # Need to have a val set so that the train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data, split_type=args.split_type, seed=args.seed)

    debug(f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}')

    debug('Computing Morgan fingerprints')
    for dataset in [train_data, test_data]:
        for datapoint in tqdm(dataset, total=len(dataset)):
            datapoint.set_features(morgan_fingerprint(smiles=datapoint.smiles, radius=args.radius, num_bits=args.num_bits))
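
    # morgan_fingerprint is assumed here to wrap RDKit roughly as follows
    # (an illustrative sketch, not necessarily the exact helper used):
    #
    #     from rdkit import Chem
    #     from rdkit.Chem import AllChem
    #     mol = Chem.MolFromSmiles(smiles)
    #     fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    #     features = np.array(fp)
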
    debug('Training')
    if args.single_task:
        scores = single_task_random_forest(train_data, test_data, metric_func, args)
    else:
        scores = multi_task_random_forest(train_data, test_data, metric_func, args)

    info(f'Test {args.metric} = {np.nanmean(scores)}')

    return scores
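

# A minimal sketch of what multi_task_random_forest might look like, built on
# scikit-learn. Everything here is an illustrative assumption: the model
# choice, the num_trees hyperparameter, and the .features/.targets attributes
# are guesses, not the project's actual implementation.
from sklearn.ensemble import RandomForestClassifier

def _multi_task_rf_sketch(train_data, test_data, metric_func, args):
    """Fit one random forest on all tasks jointly and score each task."""
    # np is assumed to be numpy, already imported at the top of this module.
    train_X = np.array([d.features for d in train_data])
    train_y = np.array([d.targets for d in train_data])
    test_X = np.array([d.features for d in test_data])
    test_y = np.array([d.targets for d in test_data])

    # RandomForestClassifier natively supports 2D (multi-output) targets.
    model = RandomForestClassifier(n_estimators=getattr(args, 'num_trees', 500))
    model.fit(train_X, train_y)
    preds = model.predict(test_X)

    # One score per task, matching the List[float] return type above.
    return [metric_func(test_y[:, i], preds[:, i]) for i in range(test_y.shape[1])]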