How to use the mlagents.tf_utils.tf.stop_gradient function in mlagents

To help you get started, we’ve selected a few mlagents examples that show popular ways tf.stop_gradient is used in public projects.
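
Before the examples, here is a minimal, self-contained sketch of what tf.stop_gradient does: it returns its input unchanged in the forward pass, but blocks backpropagation through it, so the wrapped tensor is treated as a constant when gradients are computed. The import below assumes that mlagents.tf_utils re-exports the TF1-style tf module named in the title; the rest is illustrative and not taken from the snippets.

    from mlagents.tf_utils import tf  # assumed: TF1-style TensorFlow re-exported by ml-agents

    w = tf.Variable(2.0)                      # trainable parameter
    y = 3.0 * w                               # depends on w
    target = tf.stop_gradient(y)              # same value as y, but no gradient flows through it
    loss = tf.reduce_mean(tf.square(y - target + 1.0))

    # d(loss)/dw only sees the `y` branch; the stopped `target` branch contributes nothing.
    grad = tf.gradients(loss, w)[0]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(grad))                 # 6.0: 2 * (y - target + 1) * dy/dw, with target held constant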

github StepNeverStop / RLs / mlagents / trainers / sac / optimizer.py (View on GitHub)
                ]
            )
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss)
            )

            # Do vbackup entropy bonus per branch as well.
            branched_ent_bonus = tf.stack(
                [
                    tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
                    for i, _lp in enumerate(branched_per_action_ent)
                ]
            )
            value_losses = []
            for name in stream_names:
                v_backup = tf.stop_gradient(
                    self.min_policy_qs[name]
                    - tf.reduce_mean(branched_ent_bonus, axis=0)
                )
                value_losses.append(
                    0.5
                    * tf.reduce_mean(
                        tf.to_float(self.policy.mask)
                        * tf.squared_difference(
                            self.policy_network.value_heads[name], v_backup
                        )
                    )
                )

        else:
            self.entropy_loss = -tf.reduce_mean(
                self.log_ent_coef
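
In this snippet, tf.stop_gradient turns the soft value backup (the minimum of the two Q estimates minus the entropy bonus) into a fixed regression target: minimizing the squared error updates only the value heads and never pushes gradients back into the Q networks or the policy. A stripped-down sketch of the same pattern, with hypothetical placeholders standing in for the network outputs:

    from mlagents.tf_utils import tf  # assumed TF1-style import, as in the snippets

    min_policy_q = tf.placeholder(tf.float32, [None, 1])   # stand-in for self.min_policy_qs[name]
    ent_bonus = tf.placeholder(tf.float32, [None, 1])       # stand-in for the entropy bonus term
    value_head = tf.placeholder(tf.float32, [None, 1])      # stand-in for the online value head

    # The backup is a constant with respect to everything that produced min_policy_q and ent_bonus.
    v_backup = tf.stop_gradient(min_policy_q - ent_bonus)
    value_loss = 0.5 * tf.reduce_mean(tf.squared_difference(value_head, v_backup))
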
github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / sac / models.py (View on GitHub)
            # We also have to do a different entropy and target_entropy per branch.
            branched_log_probs = self.apply_as_branches(
                self.policy_network.all_log_probs
            )
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                    for _lp, _te in zip(branched_log_probs, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.mask)
                * tf.reduce_mean(
                    self.log_ent_coef
                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                )
            )

            # Same with policy loss, we have to do the loss per branch and average them,
            # so that larger branches don't get more weight.
            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
            branched_q_term = self.apply_as_branches(
                self.policy_network.action_probs * self.policy_network.q1_p
            )

            branched_policy_loss = tf.stack(
                [
                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
                    for i, (_lp, _qt) in enumerate(
                        zip(branched_log_probs, branched_q_term)
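
Here tf.stop_gradient wraps the per-branch sum of log probabilities plus target entropy, so the entropy-coefficient loss is linear in log_ent_coef and its gradient updates only the coefficient; the policy network that produced the log probabilities is untouched. A condensed sketch of the same idea for a single branch (variable names are illustrative, not from the source):

    from mlagents.tf_utils import tf  # assumed TF1-style import

    log_ent_coef = tf.Variable(0.0, name="log_ent_coef")    # learned entropy coefficient (illustrative)
    log_probs = tf.placeholder(tf.float32, [None, 1])        # produced by the policy
    target_entropy = 0.5                                      # illustrative scalar target

    ent_sums = tf.reduce_sum(log_probs, axis=1, keepdims=True) + target_entropy
    # Only log_ent_coef receives a gradient; the policy's log_probs are treated as constants.
    entropy_loss = -tf.reduce_mean(log_ent_coef * tf.stop_gradient(ent_sums))
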
github StepNeverStop / RLs / mlagents / trainers / sac / models.py (View on GitHub)
            # Create action input (discrete)
            self.action_holder = tf.placeholder(
                shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
            )

            self.output_oh = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # For Curiosity and GAIL to retrieve selected actions. We don't
            # need the mask at this point because it's already stored in the buffer.
            self.selected_actions = tf.stop_gradient(self.output_oh)

            self.external_action_in = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # This is total entropy over all branches
            self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

        # Extract the normalized logprobs for Barracuda
        self.normalized_logprobs = tf.identity(normalized_logprobs, name="action")

        # We kept the LSTMs at a different scope than the rest, so add them if they exist.
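
Since the one-hot actions here are built from an integer placeholder, there is no differentiable path back to the policy in the first place; tf.stop_gradient makes that intent explicit and guarantees that the Curiosity and GAIL modules consuming self.selected_actions can never backpropagate into the action path. A minimal sketch of the encoding step with illustrative branch sizes:

    from mlagents.tf_utils import tf  # assumed TF1-style import

    act_size = [3, 2]                                         # two action branches (illustrative)
    action_holder = tf.placeholder(tf.int32, [None, len(act_size)], name="action_holder")

    output_oh = tf.concat(
        [tf.one_hot(action_holder[:, i], act_size[i]) for i in range(len(act_size))],
        axis=1,
    )
    selected_actions = tf.stop_gradient(output_oh)            # fixed input for downstream reward modules
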
github StepNeverStop / RLs / mlagents / trainers / policy / nn_policy.py (View on GitHub)
                self.act_size,
                reparameterize=reparameterize,
                tanh_squash=tanh_squash,
                condition_sigma=condition_sigma_on_obs,
            )

        if tanh_squash:
            self.output_pre = distribution.sample
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = distribution.sample
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
        self.entropy = distribution.entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.total_log_probs = distribution.total_log_probs
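
For continuous control the sampled action can be a differentiable function of the policy parameters (when the distribution is reparameterized), so tf.stop_gradient does real work here: anything built on self.selected_actions sees the chosen action as plain data and cannot push gradients back into the distribution. A reduced sketch, with a hypothetical Gaussian sample standing in for distribution.sample:

    from mlagents.tf_utils import tf  # assumed TF1-style import

    mu = tf.placeholder(tf.float32, [None, 2])        # stand-in for the policy mean
    sigma = tf.placeholder(tf.float32, [None, 2])     # stand-in for the policy std-dev
    epsilon = tf.random_normal(tf.shape(mu))

    output_pre = mu + sigma * epsilon                  # differentiable w.r.t. mu and sigma
    output = tf.identity(tf.clip_by_value(output_pre, -3, 3) / 3, name="action")

    # Consumers of selected_actions cannot influence mu or sigma through this tensor.
    selected_actions = tf.stop_gradient(output)
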
github StepNeverStop / RLs / mlagents / trainers / sac / optimizer.py (View on GitHub)
                    self.policy_network.q2_pheads[name],
                )

            rewards_holder = tf.placeholder(
                shape=[None], dtype=tf.float32, name="{}_rewards".format(name)
            )
            self.rewards_holders[name] = rewards_holder

        q1_losses = []
        q2_losses = []
        # Multiple q losses per stream
        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
        for i, name in enumerate(stream_names):
            _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)

            q_backup = tf.stop_gradient(
                _expanded_rewards
                + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
                * self.gammas[i]
                * self.target_network.value_heads[name]
            )

            if discrete:
                # We need to break up the Q functions by branch, and update them individually.
                branched_q1_stream = ModelUtils.break_into_branches(
                    self.policy.selected_actions * q1_streams[name], self.act_size
                )
                branched_q2_stream = ModelUtils.break_into_branches(
                    self.policy.selected_actions * q2_streams[name], self.act_size
                )

                # Reduce each branch into scalar
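
The Q backup here is the usual bootstrapped target, reward plus discounted target-network value for non-terminal steps; wrapping it in tf.stop_gradient keeps the Q losses from differentiating through the target network, so only the online Q heads move toward the target. A compact sketch of the target construction with illustrative placeholders:

    from mlagents.tf_utils import tf  # assumed TF1-style import

    gamma = 0.99
    rewards = tf.placeholder(tf.float32, [None], name="rewards")
    dones = tf.placeholder(tf.float32, [None], name="dones")
    target_value = tf.placeholder(tf.float32, [None, 1])     # stand-in for the target network's value head
    q_head = tf.placeholder(tf.float32, [None, 1])            # stand-in for the online Q head

    expanded_rewards = tf.expand_dims(rewards, axis=-1)
    expanded_dones = tf.expand_dims(dones, axis=-1)

    q_backup = tf.stop_gradient(
        expanded_rewards + (1.0 - expanded_dones) * gamma * target_value
    )
    q_loss = 0.5 * tf.reduce_mean(tf.squared_difference(q_head, q_backup))
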
github StepNeverStop / RLs / mlagents / trainers / sac / optimizer.py (View on GitHub)
            # We also have to do a different entropy and target_entropy per branch.
            branched_per_action_ent = ModelUtils.break_into_branches(
                per_action_entropy, self.act_size
            )
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                    for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.policy.mask)
                * tf.reduce_mean(
                    self.log_ent_coef
                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                )
            )

            # Same with policy loss, we have to do the loss per branch and average them,
            # so that larger branches don't get more weight.
            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
            branched_q_term = ModelUtils.break_into_branches(
                discrete_action_probs * self.policy_network.q1_p, self.act_size
            )

            branched_policy_loss = tf.stack(
                [
                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
                    for i, (_lp, _qt) in enumerate(
                        zip(branched_per_action_ent, branched_q_term)
github StepNeverStop / RLs / mlagents / trainers / ppo / models.py (View on GitHub)
        self.output = tf.identity(output)
        self.normalized_logits = tf.identity(normalized_logits, name="action")

        self.create_value_heads(self.stream_names, hidden)

        self.action_holder = tf.placeholder(
            shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
        )
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
        )
        _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
            self.all_old_log_probs, self.action_masks, self.act_size
        )

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (
                tf.stack(
                    [
                        tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=tf.nn.softmax(
github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / sac / models.py (View on GitHub)
                )
                value_losses.append(
                    0.5
                    * tf.reduce_mean(
                        tf.to_float(self.mask)
                        * tf.squared_difference(
                            self.policy_network.value_heads[name], v_backup
                        )
                    )
                )

        else:
            self.entropy_loss = -tf.reduce_mean(
                self.log_ent_coef
                * tf.to_float(self.mask)
                * tf.stop_gradient(
                    tf.reduce_sum(
                        self.policy_network.all_log_probs + self.target_entropy,
                        axis=1,
                        keep_dims=True,
                    )
                )
            )
            batch_policy_loss = tf.reduce_mean(
                self.ent_coef * self.policy_network.all_log_probs
                - self.policy_network.q1_p,
                axis=1,
            )
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.mask) * batch_policy_loss
            )
github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / sac / models.py (View on GitHub)
            # Create action input (discrete)
            self.action_holder = tf.placeholder(
                shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
            )

            self.output_oh = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # For Curiosity and GAIL to retrieve selected actions. We don't
            # need the mask at this point because it's already stored in the buffer.
            self.selected_actions = tf.stop_gradient(self.output_oh)

            self.external_action_in = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # This is total entropy over all branches
            self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

        # Extract the normalized logprobs for Barracuda
        self.normalized_logprobs = tf.identity(normalized_logprobs, name="action")

        # We kept the LSTMs at a different scope than the rest, so add them if they exist.
github StepNeverStop / RLs / mlagents / trainers / ppo / models.py (View on GitHub)
            "log_sigma_squared",
            [self.act_size[0]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )

        sigma_sq = tf.exp(self.log_sigma_sq)

        self.epsilon = tf.placeholder(
            shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
        )
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")
        self.selected_actions = tf.stop_gradient(output_post)

        # Compute probability of model output.
        all_probs = (
            -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
            - 0.5 * tf.log(2.0 * np.pi)
            - 0.5 * self.log_sigma_sq
        )

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        self.entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + self.log_sigma_sq
        )

        self.create_value_heads(self.stream_names, hidden_value)
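
In this last snippet the Gaussian log-density is evaluated at tf.stop_gradient(self.output_pre): the sampled action is treated as a fixed point, so gradients of the log probability flow only through mu and log_sigma_sq, not back through the reparameterized sample itself. A condensed sketch of the same log-probability computation (names and sizes are illustrative):

    import numpy as np
    from mlagents.tf_utils import tf  # assumed TF1-style import

    mu = tf.placeholder(tf.float32, [None, 2])                 # stand-in for the policy mean
    log_sigma_sq = tf.get_variable("log_sigma_sq", [2], initializer=tf.zeros_initializer())
    sigma_sq = tf.exp(log_sigma_sq)

    epsilon = tf.random_normal(tf.shape(mu))
    output_pre = mu + tf.sqrt(sigma_sq) * epsilon               # reparameterized sample

    # Evaluate the Gaussian log-density at the sample, while treating the sample as a constant.
    all_log_probs = (
        -0.5 * tf.square(tf.stop_gradient(output_pre) - mu) / sigma_sq
        - 0.5 * tf.log(2.0 * np.pi)
        - 0.5 * log_sigma_sq
    )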