from functools import partial

import torch.nn as nn
from fastai.basic_train import Learner
from fastai.metrics import accuracy
from optuna.integration import FastAIPruningCallback

def objective(trial):
    # type: (optuna.trial.Trial) -> float
    # `data_bunch` is assumed to be a fastai DataBunch built elsewhere.
    model = nn.Sequential(nn.Linear(20, 1), nn.Sigmoid())
    learn = Learner(data_bunch, model, metrics=[accuracy], callback_fns=[
        partial(FastAIPruningCallback, trial=trial, monitor='valid_loss')
    ])
    learn.fit(1)
    return 1.0
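A minimal sketch of how this objective might be handed to Optuna; the pruner choice and trial count are illustrative, not taken from the original snippet.

import optuna

study = optuna.create_study(direction='minimize',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=20)
print(study.best_trial.number, study.best_value)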
# Snapshot-ensemble training script. `args`, `db`, `model`, the custom
# callbacks (GroupMeanLogMAE, WarmRestartsLRScheduler), `contribs_rmse_loss`,
# and the dataframes/constants are assumed to be defined elsewhere in the project.
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')

# train model
callback_fns = [
    partial(GradientClipping, clip=10),
    partial(GroupMeanLogMAE, snapshot_ensemble=True),
    partial(SaveModelCallback, every='epoch', mode='min',
            monitor='group_mean_log_mae', name=model_se_str),
    partial(WarmRestartsLRScheduler, n_cycles=args.epochs,
            lr=(args.lr, args.lr/args.lr_div), mom=(args.mom, 0.95),
            cycle_len=args.cycle_len, cycle_mult=args.cycle_mult,
            start_epoch=args.start_epoch)
]
learn = Learner(db, model, metrics=[rmse, mae], callback_fns=callback_fns,
                wd=args.wd, loss_func=contribs_rmse_loss)
if args.start_epoch > 0: learn.load(model_se_str+f'_{args.start_epoch-1}')
else: learn.load(model_str)
torch.cuda.empty_cache()
if distributed_train: learn = learn.to_distributed(args.local_rank)
learn.fit(args.epochs)

# make predictions with each per-epoch snapshot
n_val = len(train_df[train_df['molecule_id'].isin(val_mol_ids)])
val_preds = np.zeros((n_val, args.epochs))
test_preds = np.zeros((len(test_df), args.epochs))
for m in range(args.epochs):
    print(f'Predicting for model {m}')
    learn.load(model_se_str+f'_{m}')
    # assumed loop body, mirroring the prediction block of the companion script below
    val_preds[:, m] = learn.get_preds(DatasetType.Valid)[0][:, -1].detach().numpy() * C.SC_STD + C.SC_MEAN
    test_preds[:, m] = learn.get_preds(DatasetType.Test)[0][:, -1].detach().numpy() * C.SC_STD + C.SC_MEAN
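How the per-snapshot predictions are combined is not part of the snippet; a simple assumed choice is an equal-weight average over the epoch axis.

val_ensemble = val_preds.mean(axis=1)
test_ensemble = test_preds.mean(axis=1)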
# Fastai-style distributed helpers (as in fastai v1's fastai.distributed module).
# Tail of the DistributedRecorder callback's stats hook: each worker saves its
# metric history so that read_metrics() below can aggregate it across GPUs.
stats = np.array([[v] + m for v,m in zip(recorder.val_losses,recorder.metrics)])
np.save(cache_path/f'metrics_{self.cuda_id}', stats)
def _learner_parallel(learn:Learner):
    "Use nn.DataParallel when training and remove when done"
    if not torch.cuda.is_available(): warnings.warn('CUDA is not available, check your drivers - training will continue on CPU', ResourceWarning)
    learn.callbacks.append(ParallelTrainer(learn))
    return learn

def _learner_distributed(learn:Learner, cuda_id:int, cache_dir:PathOrStr='tmp'):
    "Put `learn` on distributed training with `cuda_id`."
    learn.callbacks.append(DistributedTrainer(learn, cuda_id))
    learn.callbacks.append(DistributedRecorder(learn, cuda_id, cache_dir))
    return learn

Learner.to_distributed = _learner_distributed
Learner.to_parallel = _learner_parallel

def read_metrics(cache_path:PathOrStr, n_gpus:int, reduce:bool=True):
    losses,metrics = [],[]
    for i in range(n_gpus):
        losses.append(np.load(cache_path/f'losses_{i}.npy')[None])
        metrics.append(np.load(cache_path/f'metrics_{i}.npy')[None])
    if reduce:
        losses,metrics = np.concatenate(losses,0),np.concatenate(metrics,0)
        return losses.mean(0),metrics.mean(0)
    return losses,metrics
def setup_distrib(gpu:Any=None):
    if gpu is None: return gpu
    gpu = int(gpu)
    torch.cuda.set_device(int(gpu))
    if num_distrib() > 1:
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    return gpu
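Both training scripts here read `args.local_rank`, which PyTorch's multi-process launcher supplies; the launch command, parser defaults, and the `distributed_train` toggle below are illustrative assumptions rather than part of the original code.

import argparse
import torch

# e.g.  python -m torch.distributed.launch --nproc_per_node=4 train.py
# torch.distributed.launch starts one worker per GPU and passes --local_rank
# to each worker, so the argument parser has to accept it.
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0)
args, _ = parser.parse_known_args()
distributed_train = torch.cuda.device_count() > 1  # assumed toggle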
# Companion training script: a single model trained with a one-cycle schedule
# and best-checkpoint saving (same assumed context as above).
# initialize distributed
if distributed_train:
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')

# train model
callback_fns = [
    partial(GradientClipping, clip=10), GroupMeanLogMAE,
    partial(SaveModelCallback, every='improvement', mode='min',
            monitor='group_mean_log_mae', name=model_str)
]
learn = Learner(db, model, metrics=[rmse, mae], callback_fns=callback_fns,
                wd=args.wd, loss_func=contribs_rmse_loss)
if args.start_epoch > 0:
    learn.load(model_str)
torch.cuda.empty_cache()
if distributed_train: learn = learn.to_distributed(args.local_rank)
learn.fit_one_cycle(args.epochs, max_lr=args.lr, start_epoch=args.start_epoch)

# make predictions and undo the target scaling
val_contrib_preds = learn.get_preds(DatasetType.Valid)
test_contrib_preds = learn.get_preds(DatasetType.Test)
val_preds = val_contrib_preds[0][:,-1].detach().numpy() * C.SC_STD + C.SC_MEAN
test_preds = test_contrib_preds[0][:,-1].detach().numpy() * C.SC_STD + C.SC_MEAN
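Several callbacks above monitor 'group_mean_log_mae'; the repo's GroupMeanLogMAE implementation isn't included here, so the sketch below shows one common formulation of such a metric (mean over groups of the log of the per-group MAE), with hypothetical argument names.

import numpy as np
import pandas as pd

def group_mean_log_mae(y_true, y_pred, groups, eps=1e-9):
    # mean over groups of log(mean absolute error within each group)
    abs_err = pd.Series(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
    per_group_mae = abs_err.groupby(np.asarray(groups)).mean()
    return float(np.log(per_group_mae + eps).mean())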