Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _define_program(self):
    """Build the program used for ensemble prediction.

    Creates a main program and a startup program, stores both on the
    instance, and inside their guard declares the ``obs`` input and passes
    it through ``self._ensemble_predict``. The resulting action variable is
    kept in ``self.ensemble_predict_output`` as a single-element list.
    """
    main_prog = self.ensemble_predict_program = fluid.Program()
    startup_prog = self.startup_program = fluid.Program()

    with fluid.program_guard(main_prog, startup_prog):
        # OBS_DIM is a name defined outside this method.
        obs_input = layers.data(name='obs', shape=[OBS_DIM], dtype='float32')
        self.ensemble_predict_output = [self._ensemble_predict(obs_input)]
def __init__(self, act_dim):
    """Declare the layers of a convolutional policy/value network.

    Three ReLU convolutions are followed by a 512-unit ReLU fully-connected
    layer and two output layers: ``policy_fc`` with ``act_dim`` units and
    ``value_fc`` with a single unit.

    Args:
        act_dim: output size of the policy layer.
    """

    def relu_conv(num_filters, filter_size, stride, padding):
        # All three convolutions share the same activation.
        return layers.conv2d(
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            act='relu')

    # Layers are created in the same order as they are listed here.
    self.conv1 = relu_conv(32, 8, 4, 1)
    self.conv2 = relu_conv(64, 4, 2, 2)
    self.conv3 = relu_conv(64, 3, 1, 0)

    self.fc = layers.fc(size=512, act='relu')

    # Separate output layers for the policy and the value estimate.
    self.policy_fc = layers.fc(size=act_dim)
    self.value_fc = layers.fc(size=1)
def __init__(self, act_dim):
    """Declare a three-layer fully-connected network.

    Two 256-unit ``tanh`` layers are followed by an output layer with
    ``act_dim`` units and no activation argument.

    Args:
        act_dim: size of the output layer.
    """
    hidden_width = 256  # both hidden layers use the same width

    self.fc1 = layers.fc(size=hidden_width, act='tanh')
    self.fc2 = layers.fc(size=hidden_width, act='tanh')
    self.fc3 = layers.fc(size=act_dim)
def _critic_learn(self, obs, action, reward, next_obs, terminal, critic_lr,
                  model_id):
    """Add the critic update for one ensemble member and return its cost.

    The target is ``reward + (1 - terminal) * gamma * Q_target(next_obs,
    policy_target(next_obs))``, computed with the target model selected by
    ``model_id`` and excluded from gradient computation. The cost is the
    mean squared error between that target and the current model's
    ``value(obs, action)``, minimized with Adam.

    Args:
        obs: current observations.
        action: actions taken in ``obs``.
        reward: rewards received.
        next_obs: observations that followed.
        terminal: termination flags; cast to float32 before use.
        critic_lr: learning rate handed to ``AdamOptimizer``.
        model_id: index into ``self.models`` / ``self.target_models``.

    Returns:
        The mean squared-error cost variable.
    """
    target_model = self.target_models[model_id]
    bootstrap_action = target_model.policy(next_obs)
    bootstrap_q = target_model.value(next_obs, bootstrap_action)

    # A terminal transition contributes no bootstrapped value.
    done = layers.cast(terminal, dtype='float32')
    td_target = reward + (1.0 - done) * self.gamma * bootstrap_q
    # The target is treated as a constant during backpropagation.
    td_target.stop_gradient = True

    predicted_q = self.models[model_id].value(obs, action)
    loss = layers.reduce_mean(
        layers.square_error_cost(predicted_q, td_target))

    fluid.optimizer.AdamOptimizer(critic_lr).minimize(loss)
    return loss
def build_program(self):
    """Create the prediction and learning programs.

    ``self.pred_program`` takes an ``obs`` input and stores the result of
    ``self.alg.predict`` in ``self.pred_act``. ``self.learn_program`` takes
    ``obs``, ``act``, ``reward``, ``next_obs`` and ``terminal`` inputs,
    calls ``self.alg.learn`` and keeps the second returned value in
    ``self.critic_cost``.
    """

    def float_input(name, shape):
        # Every input except 'terminal' is declared as float32.
        return layers.data(name=name, shape=shape, dtype='float32')

    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        pred_obs = float_input('obs', [self.obs_dim])
        self.pred_act = self.alg.predict(pred_obs)

    with fluid.program_guard(self.learn_program):
        cur_obs = float_input('obs', [self.obs_dim])
        taken_act = float_input('act', [self.act_dim])
        step_reward = float_input('reward', [])
        following_obs = float_input('next_obs', [self.obs_dim])
        done_flag = layers.data(name='terminal', shape=[], dtype='bool')

        # Only the second value returned by learn() is retained.
        _unused, self.critic_cost = self.alg.learn(
            cur_obs, taken_act, step_reward, following_obs, done_flag)
def ensemble_predict(self, obs):
    """Score every actor's action with every critic and normalize the scores.

    Intended procedure:
      1. Each critic scores the actions of all actors, and its scores are
         normalized.
      2. Each actor's score is the average of the scores given by all
         critics.
      3. The action of the actor with the best score is chosen.

    NOTE(review): the code visible here ends after the normalization in
    step 1; steps 2 and 3 and any return statement are not present.
    """
    num_models = self.ensemble_num

    # One action per actor, concatenated along axis 0.
    batch_actions = layers.concat(
        [self.models[idx].policy(obs) for idx in range(num_models)], axis=0)
    # Repeat obs along axis 0 so each concatenated action has a matching row
    # (assumes obs holds a single observation -- TODO confirm).
    batch_obs = layers.expand(obs, expand_times=[num_models, 1])

    # Each critic contributes one column of scores.
    per_critic_scores = [
        layers.unsqueeze(
            self.models[idx].value(batch_obs, batch_actions), axes=[1])
        for idx in range(num_models)
    ]
    score_matrix = layers.concat(per_critic_scores, axis=1)

    # Normalize scores given by each critic: divide every column by its sum.
    column_totals = layers.reduce_sum(score_matrix, dim=0, keep_dim=True)
    column_totals = layers.expand(
        column_totals, expand_times=[num_models, 1])
    norm_score_matrix = score_matrix / column_totals
def build_program(self):
self.sample_program = fluid.Program()
self.predict_program = fluid.Program()
self.value_program = fluid.Program()
self.learn_program = fluid.Program()
with fluid.program_guard(self.sample_program):
obs = layers.data(
name='obs', shape=self.obs_shape, dtype='float32')
sample_actions, values = self.alg.sample(obs)
self.sample_outputs = [sample_actions, values]
with fluid.program_guard(self.predict_program):
obs = layers.data(
name='obs', shape=self.obs_shape, dtype='float32')
self.predict_actions = self.alg.predict(obs)
with fluid.program_guard(self.value_program):
obs = layers.data(
name='obs', shape=self.obs_shape, dtype='float32')
self.values = self.alg.value(obs)
with fluid.program_guard(self.learn_program):
obs = layers.data(