# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# (NOTE: advertisement banner captured during snippet extraction — not part of the original program.)
# NOTE(review): extraction fragment of a training loop; the original
# indentation was lost, so the flat layout below does not reflect the real
# block structure. Documented as-is — confirm structure against the source repo.
# Number of data points visited per epoch: truncated down to a multiple of
# batch_size unless the ragged last batch is explicitly requested.
num_iters = len(data) if args.last_batch else len(data) // args.batch_size * args.batch_size
# Optional background featurization: a worker process converts molecules to
# graphs and hands batches back through a bounded queue; exit_queue (size 1)
# is presumably the shutdown signal channel — verify against async_mol2graph.
if args.parallel_featurization:
batch_queue = Queue(args.batch_queue_max_size)
exit_queue = Queue(1)
batch_process = Process(target=async_mol2graph, args=(batch_queue, data, args, num_iters, args.batch_size, exit_queue, args.last_batch))
batch_process.start()
currently_loaded_batches = []
# MAML samples one task per iteration, so the loop advances by 1 rather than
# by a full batch.
iter_size = 1 if args.maml else args.batch_size
for i in trange(0, num_iters, iter_size):
# Mixture-of-experts branch: slice one batch per source domain and compute a
# joint loss against the corresponding test-smiles slice.
if args.moe:
if not args.batch_domain_encs:
model.compute_domain_encs(train_smiles) # want to recompute every batch
mol_batch = [MoleculeDataset(d[i:i + args.batch_size]) for d in data]
train_batch, train_targets = [], []
for b in mol_batch:
tb, tt = b.smiles(), b.targets()
train_batch.append(tb)
train_targets.append(tt)
test_batch = test_smiles[i:i + args.batch_size]
loss = model.compute_loss(train_batch, train_targets, test_batch)
model.zero_grad()
# Running totals for logging; here iter_count advances by the number of
# domains, not by sample count.
loss_sum += loss.item()
iter_count += len(mol_batch)
elif args.maml:
# MAML branch: sample a single task's train/test split. The fragment is
# truncated below this point.
task_train_data, task_test_data, task_idx = data.sample_maml_task(args)
mol_batch = task_test_data
smiles_batch, features_batch, target_batch = task_train_data.smiles(), task_train_data.features(), task_train_data.targets(task_idx)
# no mask since we only picked data points that have the desired target
def build_lr_scheduler(optimizer: Optimizer, args: Namespace, total_epochs: List[int] = None) -> _LRScheduler:
    """
    Builds a learning rate scheduler.

    :param optimizer: The Optimizer whose learning rate will be scheduled.
    :param args: Arguments; ``args.scheduler`` selects one of ``'noam'``,
                 ``'none'``, or ``'decay'``.
    :param total_epochs: Optional per-learning-rate epoch counts for the Noam
                         schedule; defaults to ``[args.epochs] * args.num_lrs``.
    :return: An initialized learning rate scheduler.
    :raises ValueError: If ``args.scheduler`` is not a supported scheduler name.
    """
    if args.scheduler == 'noam':
        # Noam schedule: linear warmup followed by exponential decay.
        return NoamLR(
            optimizer=optimizer,
            warmup_epochs=args.warmup_epochs,
            total_epochs=total_epochs or [args.epochs] * args.num_lrs,
            steps_per_epoch=args.train_data_size // args.batch_size,
            init_lr=args.init_lr,
            max_lr=args.max_lr,
            final_lr=args.final_lr
        )

    if args.scheduler == 'none':
        # Constant learning rate wrapped in a scheduler-compatible interface.
        return MockLR(optimizer=optimizer, lr=args.init_lr)

    if args.scheduler == 'decay':
        # Multiplies the learning rate by lr_decay_rate each scheduler step.
        return ExponentialLR(optimizer, args.lr_decay_rate)

    raise ValueError(f'Learning rate scheduler "{args.scheduler}" not supported.')
# NOTE(review): truncated fragment — the opening of this nn.Sequential (and
# the enclosing class __init__) is not visible here. Two hidden layers with a
# shared activation, ending in a single-scalar projection (presumably a
# discriminator/critic score — confirm against the full class).
self.act_func,
nn.Linear(self.hidden_size, self.hidden_size),
self.act_func,
nn.Linear(self.hidden_size, self.hidden_size),
self.act_func,
nn.Linear(self.hidden_size, 1)
)
# GAN optimizer/scheduler setup. args.wgan_beta's exact role is not visible in
# this fragment — TODO confirm against the loss code.
self.beta = args.wgan_beta
# the optimizers don't really belong here, but we put it here so that we don't clutter code for other opts
# Adam with betas=(0, 0.9); both generator (encoder) and discriminator (netD)
# share the same scaled initial learning rate.
self.optimizerG = Adam(self.encoder.parameters(), lr=args.init_lr[0] * args.gan_lr_mult, betas=(0, 0.9))
self.optimizerD = Adam(self.netD.parameters(), lr=args.init_lr[0] * args.gan_lr_mult, betas=(0, 0.9))
self.use_scheduler = args.gan_use_scheduler
if self.use_scheduler:
    self.schedulerG = NoamLR(
        self.optimizerG,
        warmup_epochs=args.warmup_epochs,
        total_epochs=args.epochs,
        steps_per_epoch=args.train_data_length // args.batch_size,
        init_lr=args.init_lr[0] * args.gan_lr_mult,
        max_lr=args.max_lr[0] * args.gan_lr_mult,
        final_lr=args.final_lr[0] * args.gan_lr_mult
    )
    # The discriminator steps gan_d_per_g times per generator step, so its
    # schedule sees proportionally more steps per epoch.
    # FIX: the original fragment never closed this NoamLR(...) call.
    self.schedulerD = NoamLR(
        self.optimizerD,
        warmup_epochs=args.warmup_epochs,
        total_epochs=args.epochs,
        steps_per_epoch=(args.train_data_length // args.batch_size) * args.gan_d_per_g,
        init_lr=args.init_lr[0] * args.gan_lr_mult,
        max_lr=args.max_lr[0] * args.gan_lr_mult,
        final_lr=args.final_lr[0] * args.gan_lr_mult
    )
# NOTE(review): this is a duplicated paste of the GAN optimizer/scheduler
# setup that appears immediately above; indentation was lost in extraction.
# the optimizers don't really belong here, but we put it here so that we don't clutter code for other opts
self.optimizerG = Adam(self.encoder.parameters(), lr=args.init_lr[0] * args.gan_lr_mult, betas=(0, 0.9))
self.optimizerD = Adam(self.netD.parameters(), lr=args.init_lr[0] * args.gan_lr_mult, betas=(0, 0.9))
self.use_scheduler = args.gan_use_scheduler
if self.use_scheduler:
# Generator schedule: one step per generator update.
self.schedulerG = NoamLR(
self.optimizerG,
warmup_epochs=args.warmup_epochs,
total_epochs=args.epochs,
steps_per_epoch=args.train_data_length // args.batch_size,
init_lr=args.init_lr[0] * args.gan_lr_mult,
max_lr=args.max_lr[0] * args.gan_lr_mult,
final_lr=args.final_lr[0] * args.gan_lr_mult
)
# Discriminator steps gan_d_per_g times per generator step, hence the
# scaled steps_per_epoch.
self.schedulerD = NoamLR(
self.optimizerD,
warmup_epochs=args.warmup_epochs,
total_epochs=args.epochs,
steps_per_epoch=(args.train_data_length // args.batch_size) * args.gan_d_per_g,
init_lr=args.init_lr[0] * args.gan_lr_mult,
max_lr=args.max_lr[0] * args.gan_lr_mult,
final_lr=args.final_lr[0] * args.gan_lr_mult
)
# NOTE(review): extraction fragment — this data-loading/splitting section
# appears twice below (a duplicated paste), and indentation was lost.
debug(f'Number of tasks = {args.num_tasks}')
if args.dataset_type == 'bert_pretraining':
data.bert_init(args, logger)
# Split data
if args.dataset_type == 'regression_with_binning': # Note: for now, binning based on whole dataset, not just training set
data, bin_predictions, regression_data = data
args.bin_predictions = bin_predictions
debug(f'Splitting data with seed {args.seed}')
# Train split comes from the binned data; val/test come from the raw
# regression data, split with the same seed/sizes.
train_data, _, _ = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
_, val_data, test_data = split_data(regression_data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
else:
debug(f'Splitting data with seed {args.seed}')
# Externally supplied test (and optionally val) sets override splitting.
if args.separate_test_set:
test_data = get_data(path=args.separate_test_set, args=args, features_path=args.separate_test_set_features, logger=logger)
if args.separate_val_set:
val_data = get_data(path=args.separate_val_set, args=args, features_path=args.separate_val_set_features, logger=logger)
train_data = data # nothing to split; we already got our test and val sets
else:
# Only a separate test set: split remaining data 80/20 into train/val.
train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
else:
train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
# Optionally replace test data with train or val data
if args.test_split == 'train':
test_data = train_data
elif args.test_split == 'val':
test_data = val_data
if args.dataset_type == 'classification':
class_sizes = get_class_sizes(data)
# --- second, duplicated copy of the section above begins here ---
if args.dataset_type == 'bert_pretraining':
data.bert_init(args, logger)
# Split data
if args.dataset_type == 'regression_with_binning': # Note: for now, binning based on whole dataset, not just training set
data, bin_predictions, regression_data = data
args.bin_predictions = bin_predictions
debug(f'Splitting data with seed {args.seed}')
train_data, _, _ = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
_, val_data, test_data = split_data(regression_data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
else:
debug(f'Splitting data with seed {args.seed}')
if args.separate_test_set:
test_data = get_data(path=args.separate_test_set, args=args, features_path=args.separate_test_set_features, logger=logger)
if args.separate_val_set:
val_data = get_data(path=args.separate_val_set, args=args, features_path=args.separate_val_set_features, logger=logger)
train_data = data # nothing to split; we already got our test and val sets
else:
train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
else:
train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
# Optionally replace test data with train or val data
if args.test_split == 'train':
test_data = train_data
elif args.test_split == 'val':
test_data = val_data
if args.dataset_type == 'classification':
class_sizes = get_class_sizes(data)
# Log per-task class balance; loop body is truncated in this fragment.
debug('Class sizes')
for i, task_class_sizes in enumerate(class_sizes):
# NOTE(review): fragment of a training-run setup function; the opening
# `if logger is not None:`-style guard is presumably above this view — verify.
debug, info = logger.debug, logger.info
else:
# No logger supplied: fall back to plain print for both levels.
debug = info = print
# Set GPU
if args.gpu is not None:
torch.cuda.set_device(args.gpu)
# Print args
debug(pformat(vars(args)))
# Get data
debug('Loading data')
args.task_names = get_task_names(args.data_path)
desired_labels = get_desired_labels(args, args.task_names)
data = get_data(path=args.data_path, args=args, logger=logger)
args.num_tasks = data.num_tasks()
args.features_size = data.features_size()
# When predicting features, the appended feature columns are not "real" tasks.
args.real_num_tasks = args.num_tasks - args.features_size if args.predict_features else args.num_tasks
debug(f'Number of tasks = {args.num_tasks}')
if args.dataset_type == 'bert_pretraining':
data.bert_init(args, logger)
# Split data
if args.dataset_type == 'regression_with_binning': # Note: for now, binning based on whole dataset, not just training set
data, bin_predictions, regression_data = data
args.bin_predictions = bin_predictions
debug(f'Splitting data with seed {args.seed}')
train_data, _, _ = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
_, val_data, test_data = split_data(regression_data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
# NOTE(review): the fragment is truncated here — this `else:` has no body in view.
else:
# NOTE(review): fragment of a random-forest baseline; the opening
# `if args.dataset_type == 'regression':` branch is above this view.
elif args.dataset_type == 'classification':
model = RandomForestClassifier(n_estimators=args.num_trees, n_jobs=-1)
else:
raise ValueError(f'dataset_type "{args.dataset_type}" not supported.')
train_targets = train_data.targets()
# Single-task data: sklearn expects a flat 1-D target vector.
if train_data.num_tasks() == 1:
train_targets = [targets[0] for targets in train_targets]
model.fit(train_data.features(), train_targets)
test_preds = model.predict(test_data.features())
# Re-wrap predictions to the [[pred], ...] shape evaluate_predictions expects.
if train_data.num_tasks() == 1:
test_preds = [[pred] for pred in test_preds]
scores = evaluate_predictions(
preds=test_preds,
targets=test_data.targets(),
metric_func=metric_func,
dataset_type=args.dataset_type
)
return scores
# NOTE(review): second random-forest fragment (appears to be a per-task loop
# body — train_features/train_targets/test_features/test_targets and `scores`
# are defined above this view).
if args.dataset_type == 'regression':
model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
elif args.dataset_type == 'classification':
# This variant additionally honors a class_weight setting, unlike the
# classifier constructed in the other fragment above.
model = RandomForestClassifier(class_weight=args.class_weight, n_estimators=args.num_trees, n_jobs=-1)
else:
raise ValueError(f'dataset_type "{args.dataset_type}" not supported.')
model.fit(train_features, train_targets)
test_preds = model.predict(test_features)
# Wrap both preds and targets as single-element rows for evaluate_predictions.
test_preds = [[pred] for pred in test_preds]
test_targets = [[target] for target in test_targets]
score = evaluate_predictions(
preds=test_preds,
targets=test_targets,
metric_func=metric_func,
dataset_type=args.dataset_type
)
# evaluate_predictions returns a per-task list; keep the single task's score.
scores.append(score[0])
return scores
# NOTE(review): fragment of a CLI entry point — `parser` and `logger` are
# created above this view. Resplits chronological data, merges/averages CSVs,
# then runs hyperparameter optimization.
parser.add_argument('--train_path', type=str, required=True,
help='Path to CSV file containing training data in chronological order')
parser.add_argument('--val_path', type=str, required=True,
help='Path to CSV file containing val data in chronological order')
parser.add_argument('--train_save', type=str, required=True,
help='Path to CSV file for new train data')
parser.add_argument('--val_save', type=str, required=True,
help='Path to CSV file for new val data')
parser.add_argument('--val_frac', type=float, default=0.2,
help='frac of data to use for validation')
parser.add_argument('--train_val_save', type=str, required=True,
help='Path to CSV file for combined train and val data')
args = parser.parse_args()
set_logger(logger, args.save_dir, args.quiet)
modify_train_args(args)
modify_hyper_opt_args(args)
# Preprocess train and validation data
resplit(args)
merge_train_val(args)
# Process each produced CSV in place (save_path == data_path); presumably
# averages duplicate rows — confirm against average_duplicates.
for path in [args.train_save, args.val_save, args.train_val_save]:
args.data_path = path
args.save_path = path
average_duplicates(args)
# Optimize hyperparameters
args.data_path = args.train_save
# Use the held-out val CSV as the "separate test set" during optimization.
args.separate_test_set = args.val_save
optimize_hyperparameters(args)
# Determine best hyperparameters, update args, and train