How to use the mlagents.tf_utils.tf.exp function in mlagents

To help you get started, we've selected a few mlagents examples based on popular ways this function is used in public projects.

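mlagents.tf_utils re-exports TensorFlow (falling back to the v1-compatible API on TensorFlow 2), so tf.exp here is TensorFlow's element-wise exponential e^x. A minimal sketch of the call itself, with made-up input values:

from mlagents.tf_utils import tf  # re-exported TensorFlow (v1-style graph API)

log_values = tf.constant([-1.0, 0.0, 1.0])
values = tf.exp(log_values)  # element-wise e^x: recovers values from their logs

with tf.Session() as sess:
    print(sess.run(values))  # approximately [0.368, 1.0, 2.718]

In the trainer code below, tf.exp almost always plays this inverse-of-log role: standard deviations, probabilities, and entropy coefficients are stored or learned in log space for numerical stability and exponentiated back when the actual quantity is needed.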

github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / sac / models.py
name="mu",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            # Policy-dependent log_sigma_sq
            log_sigma_sq = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_std",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN, LOG_STD_MAX)

            sigma_sq = tf.exp(self.log_sigma_sq)

            # Do the reparameterization trick
            policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq

            _gauss_pre = -0.5 * (
                ((policy_ - mu) / (tf.exp(self.log_sigma_sq) + EPSILON)) ** 2
                + 2 * self.log_sigma_sq
                + np.log(2 * np.pi)
            )

            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

            self.entropy = tf.reduce_sum(
                self.log_sigma_sq + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1
            )
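In the snippet above, tf.exp turns the clipped log standard deviation back into a positive scale before sampling, and the same quantity enters the Gaussian log-density. A stripped-down sketch of the pattern, with hypothetical placeholder shapes (the bounds and epsilon match typical SAC settings):

import numpy as np
from mlagents.tf_utils import tf

LOG_STD_MIN, LOG_STD_MAX, EPSILON = -20.0, 2.0, 1e-6

mu = tf.placeholder(tf.float32, [None, 4])           # action means
raw_log_std = tf.placeholder(tf.float32, [None, 4])  # unconstrained log std

log_std = tf.clip_by_value(raw_log_std, LOG_STD_MIN, LOG_STD_MAX)
std = tf.exp(log_std)  # exp undoes the log parameterization, so std > 0 always

# Reparameterization trick: a differentiable sample from N(mu, std^2)
sample = mu + tf.random_normal(tf.shape(mu)) * std

# Log-density of the sample under N(mu, std^2), summed over action dimensions
log_prob = tf.reduce_sum(
    -0.5 * (
        ((sample - mu) / (std + EPSILON)) ** 2
        + 2 * log_std
        + np.log(2 * np.pi)
    ),
    axis=1,
    keepdims=True,
)

Learning the standard deviation in log space leaves the optimizer unconstrained, while tf.exp guarantees the actual scale stays positive.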
github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / ppo / models.py
            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
                tf.reduce_sum(head, axis=1) - self.old_values[name],
                -decay_epsilon,
                decay_epsilon,
            )
            v_opt_a = tf.squared_difference(
                self.returns_holders[name], tf.reduce_sum(head, axis=1)
            )
            v_opt_b = tf.squared_difference(
                self.returns_holders[name], clipped_value_estimate
            )
            value_loss = tf.reduce_mean(
                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1]
            )
            value_losses.append(value_loss)
        self.value_loss = tf.reduce_mean(value_losses)

        r_theta = tf.exp(probs - old_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (
            tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
            * advantage
        )
        self.policy_loss = -tf.reduce_mean(
            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.mask, 2)[1]
        )
        # For cleaner stats reporting
        self.abs_policy_loss = tf.abs(self.policy_loss)

        self.loss = (
            self.policy_loss
            + 0.5 * self.value_loss
            - decay_beta
            * tf.reduce_mean(tf.dynamic_partition(entropy, self.mask, 2)[1])
        )
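Here tf.exp implements the PPO probability ratio: the policy outputs log-probabilities, so exp(log pi_new - log pi_old) equals pi_new / pi_old without ever dividing raw probabilities (which could underflow). A self-contained sketch of the clipped surrogate objective, with hypothetical placeholders and an illustrative clip range:

from mlagents.tf_utils import tf

epsilon = 0.2  # illustrative PPO clip range

log_probs = tf.placeholder(tf.float32, [None, 1])      # log pi_new(a|s)
old_log_probs = tf.placeholder(tf.float32, [None, 1])  # log pi_old(a|s)
advantage = tf.placeholder(tf.float32, [None, 1])

# exp of a log-prob difference is the probability ratio pi_new / pi_old
r_theta = tf.exp(log_probs - old_log_probs)

p_opt_a = r_theta * advantage
p_opt_b = tf.clip_by_value(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage

# Maximizing the clipped surrogate = minimizing its negation
policy_loss = -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))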
github StepNeverStop / RLs / mlagents / trainers / sac / models.py
"log_ent_coef",
                dtype=tf.float32,
                initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
                    np.float32
                ),
                trainable=True,
            )
        else:
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log(self.init_entcoef).astype(np.float32),
                trainable=True,
            )

        self.ent_coef = tf.exp(self.log_ent_coef)
        if discrete:
            # We also have to do a different entropy and target_entropy per branch.
            branched_log_probs = self.apply_as_branches(
                self.policy_network.all_log_probs
            )
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                    for _lp, _te in zip(branched_log_probs, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.mask)
                * tf.reduce_mean(
                    self.log_ent_coef
                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                )
            )
github StepNeverStop / RLs / mlagents / trainers / sac / optimizer.py
"log_ent_coef",
                dtype=tf.float32,
                initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
                    np.float32
                ),
                trainable=True,
            )
        else:
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log(self.init_entcoef).astype(np.float32),
                trainable=True,
            )

        self.ent_coef = tf.exp(self.log_ent_coef)
        if discrete:
            # We also have to do a different entropy and target_entropy per branch.
            branched_per_action_ent = ModelUtils.break_into_branches(
                per_action_entropy, self.act_size
            )
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                    for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.policy.mask)
                * tf.reduce_mean(
                    self.log_ent_coef
                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                )
            )
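Both SAC snippets above rely on the same trick for the entropy coefficient alpha: it must stay strictly positive, so log(alpha) is the trainable variable and tf.exp recovers alpha itself. A minimal sketch with an illustrative starting value:

import numpy as np
from mlagents.tf_utils import tf

init_entcoef = 1.0  # illustrative initial alpha

# Optimize log(alpha) unconstrained; exp keeps alpha itself positive
log_ent_coef = tf.get_variable(
    "log_ent_coef",
    dtype=tf.float32,
    initializer=np.log(init_entcoef).astype(np.float32),
    trainable=True,
)
ent_coef = tf.exp(log_ent_coef)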
github StepNeverStop / RLs / mlagents / trainers / sac / optimizer.py
"""
        Creates training-specific Tensorflow ops for SAC models.
        :param q1_streams: Q1 streams from policy network
        :param q2_streams: Q2 streams from policy network
        :param lr: Learning rate
        :param max_step: Total number of training steps.
        :param stream_names: List of reward stream names.
        :param discrete: Whether or not to use discrete action losses.
        """

        if discrete:
            self.target_entropy = [
                self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
                for i in self.act_size
            ]
            discrete_action_probs = tf.exp(self.policy.all_log_probs)
            per_action_entropy = discrete_action_probs * self.policy.all_log_probs
        else:
            self.target_entropy = (
                -1
                * self.continuous_target_entropy_scale
                * np.prod(self.act_size[0]).astype(np.float32)
            )

        self.rewards_holders = {}
        self.min_policy_qs = {}

        for name in stream_names:
            if discrete:
                _branched_mpq1 = ModelUtils.break_into_branches(
                    self.policy_network.q1_pheads[name] * discrete_action_probs,
                    self.act_size,
                )
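For discrete actions, the policy exposes log-probabilities, and tf.exp converts them back into probabilities so expectations over actions (such as the per-action entropy term above) can be formed. A sketch of that conversion with a hypothetical action count:

from mlagents.tf_utils import tf

all_log_probs = tf.placeholder(tf.float32, [None, 5])  # log pi(a|s) per action

action_probs = tf.exp(all_log_probs)  # back to probabilities in (0, 1]

# E[log pi] under the policy; entropy is its negation
per_action_entropy = action_probs * all_log_probs
entropy = -tf.reduce_sum(per_action_entropy, axis=1)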
github StepNeverStop / RLs / mlagents / trainers / components / reward_signals / gail / model.py

        self.discriminator_loss = -tf.reduce_mean(
            tf.log(self.expert_estimate + EPSILON)
            + tf.log(1.0 - self.policy_estimate + EPSILON)
        )

        if self.use_vail:
            # KL divergence loss (encourage latent representation to be normal)
            self.kl_loss = tf.reduce_mean(
                -tf.reduce_sum(
                    1
                    + self.z_log_sigma_sq
                    - 0.5 * tf.square(self.z_mean_expert)
                    - 0.5 * tf.square(self.z_mean_policy)
                    - tf.exp(self.z_log_sigma_sq),
                    1,
                )
            )
            self.loss = (
                self.beta * (self.kl_loss - self.mutual_information)
                + self.discriminator_loss
            )
        else:
            self.loss = self.discriminator_loss

        if self.gradient_penalty_weight > 0.0:
            self.loss += self.gradient_penalty_weight * self.create_gradient_magnitude()

        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update_batch = optimizer.minimize(self.loss)
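In the VAIL branch above, tf.exp(self.z_log_sigma_sq) recovers the latent variance sigma^2 for the closed-form KL divergence between N(mu, sigma^2) and a standard normal, KL = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2). A minimal sketch of that term for a single latent distribution, with hypothetical placeholder shapes:

from mlagents.tf_utils import tf

z_mean = tf.placeholder(tf.float32, [None, 32])          # latent mean
z_log_sigma_sq = tf.placeholder(tf.float32, [None, 32])  # latent log variance

# Closed-form KL( N(mu, sigma^2) || N(0, 1) ); tf.exp recovers sigma^2
kl_loss = tf.reduce_mean(
    -0.5 * tf.reduce_sum(
        1.0 + z_log_sigma_sq
        - tf.square(z_mean)
        - tf.exp(z_log_sigma_sq),
        axis=1,
    )
)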