# PPO-style agent: separate fluid programs for policy sampling, policy
# prediction, policy learning, value prediction, and value learning.
def build_program(self):
    self.policy_predict_program = fluid.Program()
    self.policy_sample_program = fluid.Program()
    self.policy_learn_program = fluid.Program()
    self.value_predict_program = fluid.Program()
    self.value_learn_program = fluid.Program()

    with fluid.program_guard(self.policy_sample_program):
        obs = layers.data(
            name='obs', shape=[self.obs_dim], dtype='float32')
        sampled_act = self.alg.sample(obs)
        self.policy_sample_output = [sampled_act]

    with fluid.program_guard(self.policy_predict_program):
        obs = layers.data(
            name='obs', shape=[self.obs_dim], dtype='float32')
        means = self.alg.predict(obs)
        self.policy_predict_output = [means]

    with fluid.program_guard(self.policy_learn_program):
        obs = layers.data(
            name='obs', shape=[self.obs_dim], dtype='float32')
        actions = layers.data(
            name='actions', shape=[self.act_dim], dtype='float32')
        advantages = layers.data(
            name='advantages', shape=[1], dtype='float32')
        if self.loss_type == 'KLPEN':
            beta = layers.data(name='beta', shape=[], dtype='float32')
            loss, kl = self.alg.policy_learn(obs, actions, advantages,
                                             beta)
        else:
            loss, kl = self.alg.policy_learn(obs, actions, advantages)
        self.policy_learn_output = [loss, kl]

    with fluid.program_guard(self.value_predict_program):
        obs = layers.data(
            name='obs', shape=[self.obs_dim], dtype='float32')
        value = self.alg.value_predict(obs)
        self.value_predict_output = [value]

    with fluid.program_guard(self.value_learn_program):
        obs = layers.data(
            name='obs', shape=[self.obs_dim], dtype='float32')
        val = layers.data(name='val', shape=[], dtype='float32')
        value_loss = self.alg.value_learn(obs, val)
        self.value_learn_output = [value_loss]
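
# Usage sketch (illustrative, not part of the original snippet): running the
# policy learning program built above. Feed keys must match the layers.data
# names; self.fluid_executor is assumed to come from the parl.Agent base
# class, and policy_learn_example is a hypothetical method name.
def policy_learn_example(self, obs_np, actions_np, advantages_np, beta_np=None):
    feed = {
        'obs': obs_np.astype('float32'),
        'actions': actions_np.astype('float32'),
        'advantages': advantages_np.astype('float32'),
    }
    if self.loss_type == 'KLPEN':
        feed['beta'] = beta_np.astype('float32')
    loss, kl = self.fluid_executor.run(
        self.policy_learn_program,
        feed=feed,
        fetch_list=self.policy_learn_output)
    return loss, kl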

# Actor-critic (A2C/A3C-style) agent: one program each for action prediction,
# value estimation, and learning; the learn program is additionally wrapped by
# parl.compile.
def build_program(self):
    self.predict_program = fluid.Program()
    self.value_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.predict_program):
        obs = layers.data(
            name='obs', shape=self.obs_shape, dtype='float32')
        self.predict_actions = self.alg.predict(obs)

    with fluid.program_guard(self.value_program):
        obs = layers.data(
            name='obs', shape=self.obs_shape, dtype='float32')
        self.values = self.alg.value(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(
            name='obs', shape=self.obs_shape, dtype='float32')
        actions = layers.data(name='actions', shape=[], dtype='int64')
        advantages = layers.data(
            name='advantages', shape=[], dtype='float32')
        target_values = layers.data(
            name='target_values', shape=[], dtype='float32')
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        entropy_coeff = layers.data(
            name='entropy_coeff', shape=[], dtype='float32')
        total_loss, pi_loss, vf_loss, entropy = self.alg.learn(
            obs, actions, advantages, target_values, lr, entropy_coeff)
        self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy]
    self.learn_program = parl.compile(self.learn_program, total_loss)
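
# Usage sketch (illustrative, not part of the original snippet): feeding the
# learn program built above. Input names mirror the layers.data declarations;
# self.fluid_executor is assumed from the parl.Agent base class, and
# learn_example is a hypothetical method name.
def learn_example(self, obs_np, actions_np, advantages_np, target_values_np,
                  lr, entropy_coeff):
    import numpy as np
    feed = {
        'obs': obs_np.astype('float32'),
        'actions': actions_np.astype('int64'),
        'advantages': advantages_np.astype('float32'),
        'target_values': target_values_np.astype('float32'),
        'lr': np.array([lr], dtype='float32'),
        'entropy_coeff': np.array([entropy_coeff], dtype='float32'),
    }
    total_loss, pi_loss, vf_loss, entropy = self.fluid_executor.run(
        self.learn_program, feed=feed, fetch_list=self.learn_outputs)
    return total_loss, pi_loss, vf_loss, entropy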

# Policy-gradient agent: a prediction program that outputs action
# probabilities and a learning program fed with (obs, act, reward).
def build_program(self):
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(
            name='obs', shape=[self.obs_dim], dtype='float32')
        self.act_prob = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(
            name='obs', shape=[self.obs_dim], dtype='float32')
        act = layers.data(name='act', shape=[1], dtype='int64')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        self.cost = self.alg.learn(obs, act, reward)
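
# Usage sketch (illustrative, not part of the original snippet): sampling an
# action from the prediction program. self.fluid_executor is assumed from the
# parl.Agent base class; sample_example is a hypothetical method name.
def sample_example(self, obs_np):
    import numpy as np
    obs_np = np.expand_dims(obs_np, axis=0).astype('float32')  # add batch dim
    act_prob = self.fluid_executor.run(
        self.pred_program, feed={'obs': obs_np},
        fetch_list=[self.act_prob])[0]
    act_prob = np.squeeze(act_prob, axis=0)
    # draw an action index according to the predicted probabilities
    return np.random.choice(len(act_prob), p=act_prob)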

# DQN-style agent: a prediction program for Q-values and a learning program
# fed with (obs, act, reward, next_obs, terminal) transitions.
def build_program(self):
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(
            name='obs', shape=[self._obs_dim], dtype='float32')
        self._value = self.alg.define_predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(
            name='obs', shape=[self._obs_dim], dtype='float32')
        action = layers.data(name='act', shape=[1], dtype='int32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs', shape=[self._obs_dim], dtype='float32')
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        self._cost = self.alg.define_learn(
            obs, action, reward, next_obs, terminal)
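
# Usage sketch (illustrative, not part of the original snippet): running the
# learning program on a batch of transitions. Feed keys match the layers.data
# names above; self.fluid_executor is assumed from the parl.Agent base class,
# and learn_step_example is a hypothetical method name.
def learn_step_example(self, obs, act, reward, next_obs, terminal):
    feed = {
        'obs': obs.astype('float32'),
        'act': act.astype('int32'),
        'reward': reward.astype('float32'),
        'next_obs': next_obs.astype('float32'),
        'terminal': terminal.astype('bool'),
    }
    cost = self.fluid_executor.run(
        self.learn_program, feed=feed, fetch_list=[self._cost])[0]
    return cost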

# Ensemble agent: builds one prediction program and one learning program per
# ensemble member, selecting the member through model_id.
def build_program(self):
    self.predict_programs = []
    self.predict_outputs = []
    self.learn_programs = []
    self.learn_programs_output = []
    for i in range(self.ensemble_num):
        predict_program = fluid.Program()
        with fluid.program_guard(predict_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = self.alg.predict(obs, model_id=i)
        self.predict_programs.append(predict_program)
        self.predict_outputs.append([act.name])

        learn_program = fluid.Program()
        with fluid.program_guard(learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(
                name='act', shape=[self.act_dim], dtype='float32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            actor_lr = layers.data(
                name='actor_lr',
                shape=[1],
                dtype='float32',
                append_batch_size=False)
            critic_lr = layers.data(
                name='critic_lr',
                shape=[1],
                dtype='float32',
                append_batch_size=False)
            # NOTE: the argument list below is an assumption (the original
            # snippet ends mid-call); it passes the inputs declared above for
            # ensemble member i.
            actor_loss, critic_loss = self.alg.learn(
                obs, act, reward, next_obs, terminal, actor_lr, critic_lr,
                model_id=i)
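
# Usage sketch (illustrative, not part of the original snippet): running the
# i-th member's prediction program and fetching its action by the variable
# name stored in self.predict_outputs. self.fluid_executor is assumed from
# the parl.Agent base class; ensemble_predict_example is a hypothetical name.
def ensemble_predict_example(self, obs_np, model_id):
    act = self.fluid_executor.run(
        self.predict_programs[model_id],
        feed={'obs': obs_np.astype('float32')},
        fetch_list=self.predict_outputs[model_id])[0]
    return act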