Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
self.advantage = tf.placeholder(
shape=[None], dtype=tf.float32, name="advantages"
)
advantage = tf.expand_dims(self.advantage, -1)
decay_epsilon = tf.train.polynomial_decay(
epsilon, self.global_step, max_step, 0.1, power=1.0
)
decay_beta = tf.train.polynomial_decay(
beta, self.global_step, max_step, 1e-5, power=1.0
)
value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-decay_epsilon,
decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
)
v_opt_b = tf.squared_difference(
self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)
for i in range(len(self.act_size))
],
axis=1,
)
self.selected_actions = tf.stop_gradient(self.action_oh)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
)
_, _, old_normalized_logits = self.create_discrete_action_masking_layer(
self.all_old_log_probs, self.action_masks, self.act_size
)
action_idx = [0] + list(np.cumsum(self.act_size))
self.entropy = tf.reduce_sum(
(
tf.stack(
[
tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf.nn.softmax(
self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
),
logits=self.all_log_probs[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN, LOG_STD_MAX)
sigma_sq = tf.exp(self.log_sigma_sq)
# Do the reparameterization trick
policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq
_gauss_pre = -0.5 * (
((policy_ - mu) / (tf.exp(self.log_sigma_sq) + EPSILON)) ** 2
+ 2 * self.log_sigma_sq
+ np.log(2 * np.pi)
)
all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)
self.entropy = tf.reduce_sum(
self.log_sigma_sq + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1
)
# Squash probabilities
# Keep deterministic around in case we want to use it.
self.deterministic_output = tf.tanh(mu)
# Note that this is just for symmetry with PPO.
self.output_pre = tf.tanh(policy_)
# Squash correction
all_probs -= tf.reduce_sum(
tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
)
self.all_log_probs = all_probs
value_losses.append(
0.5
* tf.reduce_mean(
tf.to_float(self.mask)
* tf.squared_difference(
self.policy_network.value_heads[name], v_backup
)
)
)
else:
self.entropy_loss = -tf.reduce_mean(
self.log_ent_coef
* tf.to_float(self.mask)
* tf.stop_gradient(
tf.reduce_sum(
self.policy_network.all_log_probs + self.target_entropy,
axis=1,
keep_dims=True,
)
)
)
batch_policy_loss = tf.reduce_mean(
self.ent_coef * self.policy_network.all_log_probs
- self.policy_network.q1_p,
axis=1,
)
self.policy_loss = tf.reduce_mean(
tf.to_float(self.mask) * batch_policy_loss
)
value_losses = []
def create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
"""
Creates inverse model TensorFlow ops for Curiosity module.
Predicts action taken given current and future encoded states.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
if self.policy.brain.vector_action_space_type == "continuous":
pred_action = tf.layers.dense(
hidden, self.policy.act_size[0], activation=None
)
squared_difference = tf.reduce_sum(
tf.squared_difference(pred_action, self.policy.selected_actions), axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
)
else:
pred_action = tf.concat(
[
tf.layers.dense(
hidden, self.policy.act_size[i], activation=tf.nn.softmax
)
for i in range(len(self.policy.act_size))
],
axis=1,
)
cross_entropy = tf.reduce_sum(
name="lstm_value",
)
self.memory_out = memory_value_out
else:
hidden_value = hidden_stream
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
self.old_log_probs = tf.reduce_sum(
(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
if discrete:
# We need to break up the Q functions by branch, and update them individually.
branched_q1_stream = self.apply_as_branches(
self.policy_network.external_action_in * q1_streams[name]
)
branched_q2_stream = self.apply_as_branches(
self.policy_network.external_action_in * q2_streams[name]
)
# Reduce each branch into scalar
branched_q1_stream = [
tf.reduce_sum(_branch, axis=1, keep_dims=True)
for _branch in branched_q1_stream
]
branched_q2_stream = [
tf.reduce_sum(_branch, axis=1, keep_dims=True)
for _branch in branched_q2_stream
]
q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
else:
q1_stream = q1_streams[name]
q2_stream = q2_streams[name]
_q1_loss = 0.5 * tf.reduce_mean(
tf.to_float(self.mask) * tf.squared_difference(q_backup, q1_stream)
)
_q2_loss = 0.5 * tf.reduce_mean(
tf.to_float(self.mask) * tf.squared_difference(q_backup, q2_stream)
"gail_beta",
[],
trainable=False,
dtype=tf.float32,
initializer=tf.ones_initializer(),
)
self.discriminator_loss = -tf.reduce_mean(
tf.log(self.expert_estimate + EPSILON)
+ tf.log(1.0 - self.policy_estimate + EPSILON)
)
if self.use_vail:
# KL divergence loss (encourage latent representation to be normal)
self.kl_loss = tf.reduce_mean(
-tf.reduce_sum(
1
+ self.z_log_sigma_sq
- 0.5 * tf.square(self.z_mean_expert)
- 0.5 * tf.square(self.z_mean_policy)
- tf.exp(self.z_log_sigma_sq),
1,
)
)
self.loss = (
self.beta * (self.kl_loss - self.mutual_information)
+ self.discriminator_loss
)
else:
self.loss = self.discriminator_loss
if self.gradient_penalty_weight > 0.0:
self.advantage = tf.placeholder(
shape=[None], dtype=tf.float32, name="advantages"
)
advantage = tf.expand_dims(self.advantage, -1)
decay_epsilon = tf.train.polynomial_decay(
epsilon, self.global_step, max_step, 0.1, power=1.0
)
decay_beta = tf.train.polynomial_decay(
beta, self.global_step, max_step, 1e-5, power=1.0
)
value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-decay_epsilon,
decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
)
v_opt_b = tf.squared_difference(
self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)