Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def average_gradients(self, tower_grads):
"""
Average gradients from all towers
:param tower_grads: Gradients from all towers
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
grads = [g for g, _ in grad_and_vars if g is not None]
if not grads:
continue
avg_grad = tf.reduce_mean(tf.stack(grads), 0)
var = grad_and_vars[0][1]
average_grads.append((avg_grad, var))
return average_grads
)
else:
self.log_ent_coef = tf.get_variable(
"log_ent_coef",
dtype=tf.float32,
initializer=np.log(self.init_entcoef).astype(np.float32),
trainable=True,
)
self.ent_coef = tf.exp(self.log_ent_coef)
if discrete:
# We also have to do a different entropy and target_entropy per branch.
branched_log_probs = self.apply_as_branches(
self.policy_network.all_log_probs
)
branched_ent_sums = tf.stack(
[
tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
for _lp, _te in zip(branched_log_probs, self.target_entropy)
],
axis=1,
)
self.entropy_loss = -tf.reduce_mean(
tf.to_float(self.mask)
* tf.reduce_mean(
self.log_ent_coef
* tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
axis=1,
)
)
# Same with policy loss, we have to do the loss per branch and average them,
stream_names=list(reward_signal_configs.keys()),
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
)
self.towers[-1].create_ppo_optimizer()
self.model = self.towers[0]
avg_grads = self.average_gradients([t.grads for t in self.towers])
update_batch = self.model.optimizer.apply_gradients(avg_grads)
avg_value_loss = tf.reduce_mean(
tf.stack([model.value_loss for model in self.towers]), 0
)
avg_policy_loss = tf.reduce_mean(
tf.stack([model.policy_loss for model in self.towers]), 0
)
self.inference_dict.update(
{
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"value_heads": self.model.value_heads,
"value": self.model.value,
"entropy": self.model.entropy,
"learning_rate": self.model.learning_rate,
}
)
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out
)
else:
self.log_ent_coef = tf.get_variable(
"log_ent_coef",
dtype=tf.float32,
initializer=np.log(self.init_entcoef).astype(np.float32),
trainable=True,
)
self.ent_coef = tf.exp(self.log_ent_coef)
if discrete:
# We also have to do a different entropy and target_entropy per branch.
branched_per_action_ent = ModelUtils.break_into_branches(
per_action_entropy, self.act_size
)
branched_ent_sums = tf.stack(
[
tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
],
axis=1,
)
self.entropy_loss = -tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.reduce_mean(
self.log_ent_coef
* tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
axis=1,
)
)
# Same with policy loss, we have to do the loss per branch and average them,
),
logits=self.all_log_probs[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
)
self.log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
logits=normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
self.old_log_probs = tf.reduce_sum(
)
branched_policy_loss = tf.stack(
[
tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
for i, (_lp, _qt) in enumerate(
zip(branched_log_probs, branched_q_term)
)
]
)
self.policy_loss = tf.reduce_mean(
tf.to_float(self.mask) * tf.squeeze(branched_policy_loss)
)
# Do vbackup entropy bonus per branch as well.
branched_ent_bonus = tf.stack(
[
tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
for i, _lp in enumerate(branched_log_probs)
]
)
value_losses = []
for name in stream_names:
v_backup = tf.stop_gradient(
self.min_policy_qs[name]
- tf.reduce_mean(branched_ent_bonus, axis=0)
)
value_losses.append(
0.5
* tf.reduce_mean(
tf.to_float(self.mask)
* tf.squared_difference(
self.target_entropy = (
-1
* CONTINUOUS_TARGET_ENTROPY_SCALE
* np.prod(self.act_size[0]).astype(np.float32)
)
self.rewards_holders = {}
self.min_policy_qs = {}
for name in stream_names:
if discrete:
_branched_mpq1 = self.apply_as_branches(
self.policy_network.q1_pheads[name]
* self.policy_network.action_probs
)
branched_mpq1 = tf.stack(
[
tf.reduce_sum(_br, axis=1, keep_dims=True)
for _br in _branched_mpq1
]
)
_q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
_branched_mpq2 = self.apply_as_branches(
self.policy_network.q2_pheads[name]
* self.policy_network.action_probs
)
branched_mpq2 = tf.stack(
[
tf.reduce_sum(_br, axis=1, keep_dims=True)
for _br in _branched_mpq2
]
axis=1,
)
self.selected_actions = tf.stop_gradient(self.action_oh)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
)
_, _, old_normalized_logits = self.create_discrete_action_masking_layer(
self.all_old_log_probs, self.action_masks, self.act_size
)
action_idx = [0] + list(np.cumsum(self.act_size))
self.entropy = tf.reduce_sum(
(
tf.stack(
[
tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf.nn.softmax(
self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
),
logits=self.all_log_probs[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
)
)
# Break old log probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.all_old_log_probs, self.policy.act_size
)
_, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
old_log_prob_branches, self.policy.action_masks, self.policy.act_size
)
action_idx = [0] + list(np.cumsum(self.policy.act_size))
self.old_log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.policy.selected_actions[
:, action_idx[i] : action_idx[i + 1]
],
logits=old_normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.policy.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
logits=normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
self.old_log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
logits=old_normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)