# GAIL (+ optional VAIL) discriminator loss. beta scales the KL penalty when the
# variational bottleneck (VAIL) is enabled.
self.beta = tf.get_variable(
    "gail_beta",
    [],
    trainable=False,
    dtype=tf.float32,
    initializer=tf.ones_initializer(),
)

# Standard GAIL binary cross-entropy: push expert estimates toward 1 and
# policy estimates toward 0.
self.discriminator_loss = -tf.reduce_mean(
    tf.log(self.expert_estimate + EPSILON)
    + tf.log(1.0 - self.policy_estimate + EPSILON)
)
if self.use_vail:
    # KL divergence loss (encourage latent representation to be normal)
    self.kl_loss = tf.reduce_mean(
        -tf.reduce_sum(
            1
            + self.z_log_sigma_sq
            - 0.5 * tf.square(self.z_mean_expert)
            - 0.5 * tf.square(self.z_mean_policy)
            - tf.exp(self.z_log_sigma_sq),
            1,
        )
    )
    self.loss = (
        self.beta * (self.kl_loss - self.mutual_information)
        + self.discriminator_loss
    )
else:
    self.loss = self.discriminator_loss
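
The discriminator loss above is the standard GAIL binary cross-entropy between expert and policy estimates. Below is a minimal standalone sketch of the same objective with placeholder inputs; it assumes TensorFlow 1.x, and the tensor names are illustrative rather than taken from the surrounding class.

import tensorflow as tf

EPSILON = 1e-7  # small constant keeps tf.log finite when an estimate hits 0 or 1
expert_estimate = tf.placeholder(tf.float32, [None])  # D(expert sample), in (0, 1)
policy_estimate = tf.placeholder(tf.float32, [None])  # D(policy sample), in (0, 1)

# The discriminator is trained to score expert samples near 1 and policy samples near 0.
discriminator_loss = -tf.reduce_mean(
    tf.log(expert_estimate + EPSILON) + tf.log(1.0 - policy_estimate + EPSILON)
)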
def create_sac_value_head(
    self, stream_names, hidden_input, num_layers, h_size, scope
):
    """
    Creates one value estimator head for each reward signal in stream_names.
    Also creates the node corresponding to the mean of all the value heads in self.value.
    self.value_heads is a dictionary of stream name to node containing the value estimator head for that signal.
    :param stream_names: The list of reward signal names
    :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
        of the hidden input.
    :param num_layers: Number of hidden layers for value network
    :param h_size: Size of hidden layers for value network
    :param scope: TF scope for value network.
    """
    with tf.variable_scope(scope):
        value_hidden = self.create_vector_observation_encoder(
            hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
        )
        if self.use_recurrent:
            value_hidden, memory_out = self.create_recurrent_encoder(
                value_hidden,
                self.value_memory_in,
                self.sequence_length,
                name="lstm_value",
            )
            self.value_memory_out = memory_out
        self.create_value_heads(stream_names, value_hidden)
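
create_value_heads itself is not shown in this snippet. As a rough sketch of what such a helper typically does (names and shapes assumed, not taken from the source): one linear output per reward stream, plus the mean across streams as the aggregate value estimate.

def create_value_heads_sketch(stream_names, hidden_input):
    # One scalar value output per reward signal, e.g. "extrinsic", "gail", "curiosity".
    value_heads = {}
    for name in stream_names:
        value_heads[name] = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
    # Mean of all heads serves as the aggregate value estimate.
    value = tf.reduce_mean(list(value_heads.values()), axis=0)
    return value_heads, value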
# PPO value loss: squared error between returns and the clipped value estimate.
# v_opt_a (the unclipped counterpart) and clipped_value_estimate are defined earlier
# in the same loop over reward streams.
v_opt_b = tf.squared_difference(
    self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
    tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)

# PPO clipped surrogate objective: r_theta is the new/old policy probability ratio.
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
    tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
    * advantage
)
self.policy_loss = -tf.reduce_mean(
    tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.mask, 2)[1]
)
# For cleaner stats reporting
self.abs_policy_loss = tf.abs(self.policy_loss)

# Total loss: clipped policy loss + value loss - entropy bonus weighted by decay_beta.
self.loss = (
    self.policy_loss
    + 0.5 * self.value_loss
    - decay_beta
    * tf.reduce_mean(tf.dynamic_partition(entropy, self.mask, 2)[1])
)
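
The policy loss above is the PPO clipped surrogate objective. The tiny NumPy sketch below uses made-up ratios and advantages to show how clipping caps the incentive to move the policy more than decay_epsilon away from the old one.

import numpy as np

epsilon = 0.2
r_theta = np.array([0.5, 1.0, 1.5])    # new/old action probability ratios (made up)
advantage = np.array([1.0, 1.0, 1.0])  # positive advantages (made up)

p_opt_a = r_theta * advantage
p_opt_b = np.clip(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
policy_loss = -np.mean(np.minimum(p_opt_a, p_opt_b))
# min(0.5, 0.8) = 0.5, min(1.0, 1.0) = 1.0, min(1.5, 1.2) = 1.2  ->  loss = -0.9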
"""
Creates inverse model TensorFlow ops for Curiosity module.
Predicts action taken given current and future encoded states.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
if self.policy.brain.vector_action_space_type == "continuous":
pred_action = tf.layers.dense(
hidden, self.policy.act_size[0], activation=None
)
squared_difference = tf.reduce_sum(
tf.squared_difference(pred_action, self.policy.selected_actions), axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
)
else:
pred_action = tf.concat(
[
tf.layers.dense(
hidden, self.policy.act_size[i], activation=tf.nn.softmax
)
for i in range(len(self.policy.act_size))
],
axis=1,
)
cross_entropy = tf.reduce_sum(
-tf.log(pred_action + 1e-10) * self.policy.selected_actions, axis=1
)
self.inverse_loss = tf.reduce_mean(
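
Both branches above reduce the per-step loss with the same tf.dynamic_partition masking idiom that also appears in the PPO losses. A small self-contained sketch of that idiom (TF 1.x, values made up):

import tensorflow as tf

values = tf.constant([0.1, 0.2, 0.3, 0.4])
mask = tf.constant([1, 0, 1, 1], dtype=tf.int32)  # 1 = valid step, 0 = padding
# Partitioning into 2 groups splits values into [masked-out, kept]; index [1] keeps
# only the valid steps, so the mean ignores padded entries.
kept = tf.dynamic_partition(values, mask, 2)[1]   # -> [0.1, 0.3, 0.4]
masked_mean = tf.reduce_mean(kept)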
def __init__(
    self, policy: TFPolicy, encoding_size: int = 128, learning_rate: float = 3e-4
):
    """
    Creates the curiosity model for the Curiosity reward Generator
    :param policy: The policy being trained
    :param encoding_size: The size of the encoding for the Curiosity module
    :param learning_rate: The learning rate for the curiosity module
    """
    self.encoding_size = encoding_size
    self.policy = policy
    self.next_visual_in: List[tf.Tensor] = []
    encoded_state, encoded_next_state = self.create_curiosity_encoders()
    self.create_inverse_model(encoded_state, encoded_next_state)
    self.create_forward_model(encoded_state, encoded_next_state)
    self.create_loss(learning_rate)
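
For context, construction would look roughly like the following. This is illustrative only: the class name CuriosityModel is assumed from the docstring, and `policy` must be a TFPolicy supplied by the surrounding trainer code, which this snippet does not build.

curiosity_model = CuriosityModel(
    policy=policy,        # assumed to exist in the trainer; not constructed here
    encoding_size=128,    # size of the state encoding used by the inverse/forward models
    learning_rate=3e-4,   # learning rate for the curiosity module's optimizer
)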