Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
for _branch in branched_q1_stream
]
branched_q2_stream = [
tf.reduce_sum(_branch, axis=1, keep_dims=True)
for _branch in branched_q2_stream
]
q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
else:
q1_stream = q1_streams[name]
q2_stream = q2_streams[name]
_q1_loss = 0.5 * tf.reduce_mean(
tf.to_float(self.mask) * tf.squared_difference(q_backup, q1_stream)
)
_q2_loss = 0.5 * tf.reduce_mean(
tf.to_float(self.mask) * tf.squared_difference(q_backup, q2_stream)
)
q1_losses.append(_q1_loss)
q2_losses.append(_q2_loss)
self.q1_loss = tf.reduce_mean(q1_losses)
self.q2_loss = tf.reduce_mean(q2_losses)
# Learn entropy coefficient
if discrete:
# Create a log_ent_coef for each branch
self.log_ent_coef = tf.get_variable(
)
self.decay_beta = ModelUtils.create_schedule(
self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
)
value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-self.decay_epsilon,
self.decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
)
v_opt_b = tf.squared_difference(
self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[
1
]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
tf.clip_by_value(
r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
)
)
self.policy_loss = tf.reduce_mean(
tf.to_float(self.policy.mask) * batch_policy_loss
)
value_losses = []
for name in stream_names:
v_backup = tf.stop_gradient(
self.min_policy_qs[name]
- tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
)
value_losses.append(
0.5
* tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(
self.policy_network.value_heads[name], v_backup
)
)
)
self.value_loss = tf.reduce_mean(value_losses)
self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
self.entropy = self.policy_network.entropy
decay_epsilon = tf.train.polynomial_decay(
epsilon, self.global_step, max_step, 0.1, power=1.0
)
decay_beta = tf.train.polynomial_decay(
beta, self.global_step, max_step, 1e-5, power=1.0
)
value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-decay_epsilon,
decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
)
v_opt_b = tf.squared_difference(
self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
* advantage
]
branched_q2_stream = [
tf.reduce_sum(_branch, axis=1, keep_dims=True)
for _branch in branched_q2_stream
]
q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
else:
q1_stream = q1_streams[name]
q2_stream = q2_streams[name]
_q1_loss = 0.5 * tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(q_backup, q1_stream)
)
_q2_loss = 0.5 * tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(q_backup, q2_stream)
)
q1_losses.append(_q1_loss)
q2_losses.append(_q2_loss)
self.q1_loss = tf.reduce_mean(q1_losses)
self.q2_loss = tf.reduce_mean(q2_losses)
# Learn entropy coefficient
if discrete:
# Create a log_ent_coef for each branch
self.decay_epsilon = ModelUtils.create_schedule(
self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
)
self.decay_beta = ModelUtils.create_schedule(
self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
)
value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-self.decay_epsilon,
self.decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
)
v_opt_b = tf.squared_difference(
self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[
1
]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
)
decay_beta = tf.train.polynomial_decay(
beta, self.global_step, max_step, 1e-5, power=1.0
)
value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-decay_epsilon,
decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
)
v_opt_b = tf.squared_difference(
self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
* advantage
)
self.policy_loss = -tf.reduce_mean(
tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.mask, 2)[1]
def create_loss(self, learning_rate: float, anneal_steps: int) -> None:
"""
Creates the loss and update nodes for the BC module
:param learning_rate: The learning rate for the optimizer
:param anneal_steps: Number of steps over which to anneal the learning_rate
"""
selected_action = self.policy.output
if self.policy.use_continuous_act:
self.loss = tf.reduce_mean(
tf.squared_difference(selected_action, self.expert_action)
)
else:
log_probs = self.policy.all_log_probs
self.loss = tf.reduce_mean(
-tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action
)
if anneal_steps > 0:
self.annealed_learning_rate = tf.train.polynomial_decay(
learning_rate, self.policy.global_step, anneal_steps, 0.0, power=1.0
)
else:
self.annealed_learning_rate = tf.Variable(learning_rate)
optimizer = tf.train.AdamOptimizer(
learning_rate=self.annealed_learning_rate, name="bc_adam"