# Summarize a list of Q-value tensors into descriptive statistics for logging
describe_it = lambda x: describe(torch.cat(x).detach().cpu().numpy().squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
out['Q1'] = describe_it(Q1_vals)
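
# Illustrative sketch (added, not from the source): the tensor plumbing in the
# lambda above just flattens a list of per-batch Q-value tensors into a single
# NumPy array before summarizing it. `Q1_vals_demo` and its shapes are assumptions.
import torch
Q1_vals_demo = [torch.randn(4, 1), torch.randn(4, 1)]  # two hypothetical critic output batches
flat = torch.cat(Q1_vals_demo).detach().cpu().numpy().squeeze()
print(flat.shape)  # (8,) -- one flat array, ready for summary statistics
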
          'rank_transform': config['train.rank_transform']})  # tail of the ES constructor options; the call opens above this snippet
train_logs = []
checkpoint_count = 0
with Pool(processes=config['train.popsize']//config['train.worker_chunksize']) as pool:
    print('Finish initialization. Training starts...')
    for generation in range(config['train.generations']):
        t0 = time.perf_counter()
        solutions = es.ask()
        data = [(config, seed, device, solution) for solution in solutions]
        out = pool.map(CloudpickleWrapper(fitness), data, chunksize=config['train.worker_chunksize'])
        Rs, Hs = zip(*out)  # each worker returns a (return, horizon) pair per candidate solution
        es.tell(solutions, [-R for R in Rs])  # the ES minimizes, so negate returns to maximize them
        logger = Logger()
        logger('generation', generation+1)
        logger('num_seconds', round(time.perf_counter() - t0, 1))
        logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('fbest', es.result.fbest)
        train_logs.append(logger.logs)
        if generation == 0 or (generation+1) % config['log.freq'] == 0:
            logger.dump(keys=None, index=0, indent=0, border='-'*50)
        if (generation+1) >= int(config['train.generations']*(checkpoint_count/(config['checkpoint.num'] - 1))):
            agent.from_vec(tensorify(es.result.xbest, 'cpu'))  # load best-so-far parameters into the agent
            agent.checkpoint(logdir, generation+1)
            checkpoint_count += 1
pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
return None
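
# Illustrative sketch (added, not from the source) of the checkpoint schedule
# used above: with `checkpoint.num` evenly spaced checkpoints over
# `train.generations` generations, the threshold test determines which
# generations save. The toy numbers (100 generations, 5 checkpoints) are assumptions.
demo_generations, demo_num_checkpoints = 100, 5
demo_count = 0
demo_saved_at = []
for g in range(demo_generations):
    if (g+1) >= int(demo_generations*(demo_count/(demo_num_checkpoints - 1))):
        demo_saved_at.append(g+1)
        demo_count += 1
print(demo_saved_at)  # [1, 25, 50, 75, 100] -- first generation plus evenly spaced checkpoints
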
        next_observation, reward, done, info = self.eval_env.step(action)
        if done[0]:  # [0] single environment
            returns.append(info[0]['episode']['return'])
            horizons.append(info[0]['episode']['horizon'])
            break
        observation = next_observation
logger = Logger()
logger('num_seconds', round(perf_counter() - start_time, 1))
logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
return logger.logs
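
# Hypothetical sketch (added, not lagom's actual implementation) of what a
# get_wrapper-style lookup typically does: walk the wrapper chain via `.env`
# until a wrapper with the requested class name is found. The name
# `find_wrapper_by_name` is an assumption for illustration only.
def find_wrapper_by_name(env, name):
    while env is not None:
        if type(env).__name__ == name:
            return env
        env = getattr(env, 'env', None)
    return None
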
for key, value in out_agent.items():  # merge agent diagnostics into the logger
    logger(key, value)
logger('num_trajectories', len(D))
logger('num_timesteps', sum([len(traj) for traj in D]))
logger('accumulated_trained_timesteps', self.agent.total_timestep)
G = [traj.numpy_rewards.sum() for traj in D]  # undiscounted return of each trajectory
logger('return', describe(G, axis=-1, repr_indent=1, repr_prefix='\n'))
infos = [info for info in chain.from_iterable([traj.infos for traj in D]) if 'episode' in info]
online_returns = [info['episode']['return'] for info in infos]
online_horizons = [info['episode']['horizon'] for info in infos]
logger('online_return', describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('online_horizon', describe(online_horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
monitor_env = get_wrapper(self.env, 'VecMonitor')
logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
return logger
logger = Logger()
logger('train_iteration', n+1)
logger('num_seconds', round(perf_counter() - start_time, 1))
for key, value in out_agent.items():  # merge agent diagnostics into the logger
    logger(key, value)
logger('num_trajectories', len(D))
logger('num_timesteps', sum([len(traj) for traj in D]))
logger('accumulated_trained_timesteps', self.agent.total_timestep)
G = [traj.numpy_rewards.sum() for traj in D]  # undiscounted return of each trajectory
logger('return', describe(G, axis=-1, repr_indent=1, repr_prefix='\n'))
infos = [info for info in chain.from_iterable([traj.infos for traj in D]) if 'episode' in info]
online_returns = [info['episode']['return'] for info in infos]
online_horizons = [info['episode']['horizon'] for info in infos]
logger('online_return', describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('online_horizon', describe(online_horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
monitor_env = get_wrapper(self.env, 'VecMonitor')
logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
return logger
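
# Rough, self-contained stand-in (added; NOT lagom's describe -- the exact
# fields and formatting are assumptions) for the kind of summary the
# describe(...) calls above log.
import numpy as np
def summarize(x):
    x = np.asarray(x, dtype=float)
    return {'N': x.size, 'mean': float(x.mean()), 'std': float(x.std()),
            'min': float(x.min()), 'max': float(x.max())}
print(summarize([1.0, 2.0, 3.0]))  # {'N': 3, 'mean': 2.0, 'std': 0.816..., 'min': 1.0, 'max': 3.0}
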
    observation = self.eval_env.reset()
    for _ in range(self.eval_env.spec.max_episode_steps):
        with torch.no_grad():
            action = self.agent.choose_action(observation, mode='eval')['action']
        next_observation, reward, done, info = self.eval_env.step(action)
        if done[0]:  # [0] single environment
            returns.append(info[0]['episode']['return'])
            horizons.append(info[0]['episode']['horizon'])
            break
        observation = next_observation
logger = Logger()
logger('num_seconds', round(perf_counter() - start_time, 1))
logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
return logger.logs
if self.config['agent.use_lr_scheduler']:
    self.lr_scheduler.step(self.total_timestep)
self.optimizer.step()
self.total_timestep += sum([len(traj) for traj in D])
out = {}
if self.config['agent.use_lr_scheduler']:
    out['current_lr'] = self.lr_scheduler.get_lr()
out['loss'] = loss.item()
out['grad_norm'] = grad_norm
out['policy_loss'] = policy_loss.mean().item()
out['entropy_loss'] = entropy_loss.mean().item()
out['policy_entropy'] = -entropy_loss.mean().item()  # entropy_loss is the negated entropy bonus, so negate it back to report entropy
out['value_loss'] = value_loss.mean().item()
Vs_numpy = Vs.detach().cpu().numpy().squeeze()
out['V'] = describe(Vs_numpy, axis=-1, repr_indent=1, repr_prefix='\n')
out['explained_variance'] = ev(y_true=Qs.detach().cpu().numpy(), y_pred=Vs.detach().cpu().numpy())
return out
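
# The explained-variance diagnostic compares value predictions Vs against the
# targets Qs: 1.0 means the critic explains all of the variance in the targets,
# 0 means none. A self-contained sketch (added), assuming `ev` is sklearn's
# explained_variance_score -- an assumption about the import; the toy numbers
# below are made up.
import numpy as np
from sklearn.metrics import explained_variance_score
y_true = np.array([1.0, 2.0, 3.0])   # stand-in for Qs (targets)
y_pred = np.array([0.9, 2.1, 2.8])   # stand-in for Vs (predictions)
print(explained_variance_score(y_true=y_true, y_pred=y_pred))  # ~0.977
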
for _ in range(self.config['eval.num_episode']):
    observation = self.eval_env.reset()
    for _ in range(self.eval_env.spec.max_episode_steps):
        with torch.no_grad():
            action = self.agent.choose_action(observation, mode='eval')['action']
        next_observation, reward, done, info = self.eval_env.step(action)
        if done[0]:  # [0] single environment
            returns.append(info[0]['episode']['return'])
            horizons.append(info[0]['episode']['horizon'])
            break
        observation = next_observation
logger = Logger()
logger('num_seconds', round(perf_counter() - start_time, 1))
logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
return logger.logs
def eval(self, n=None, **kwargs):
    t0 = time.perf_counter()
    with torch.no_grad():
        D = self.runner(self.agent, self.eval_env, 10, mode='eval')
    logger = Logger()
    logger('eval_iteration', n+1)
    logger('num_seconds', round(time.perf_counter() - t0, 1))
    logger('accumulated_trained_timesteps', self.agent.total_timestep)
    logger('online_return', describe([sum(traj.rewards) for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe([traj.T for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_return', describe(self.eval_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(self.eval_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
    return logger.logs