How to use the mlagents.tf_utils.tf.stack function in mlagents

To help you get started, we've selected a few mlagents examples based on popular ways tf.stack is used in public projects.
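
As a quick orientation before the repository snippets below, here is a minimal, self-contained sketch of the pattern they all share: build a Python list of tensors, stack it into a single tensor with tf.stack, then reduce along the new axis. It assumes the TF 1.x-style tf module that mlagents.tf_utils re-exports; the tensor values are invented for illustration.

from mlagents.tf_utils import tf  # ml-agents' TF 1.x-compatible tensorflow handle

# A Python list of same-shaped tensors (made-up values), e.g. one loss per tower.
losses = [tf.constant([0.1, 0.2]), tf.constant([0.5, 0.6])]

# tf.stack joins the list along a new leading axis: shape [2, 2].
stacked = tf.stack(losses)

# Reducing over that axis averages across the list entries: shape [2].
mean_loss = tf.reduce_mean(stacked, 0)

with tf.Session() as sess:
    print(sess.run(mean_loss))  # [0.3, 0.4]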


github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / ppo / multi_gpu_policy.py
    def average_gradients(self, tower_grads):
        """
        Average gradients from all towers
        :param tower_grads: Gradients from all towers
        """
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = [g for g, _ in grad_and_vars if g is not None]
            if not grads:
                continue
            avg_grad = tf.reduce_mean(tf.stack(grads), 0)
            var = grad_and_vars[0][1]
            average_grads.append((avg_grad, var))
        return average_grads
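
To make the stack-and-average step concrete, here is a small usage sketch of the same pattern with two hypothetical towers and a single shared variable; the gradient values are made up and the surrounding policy class is omitted.

from mlagents.tf_utils import tf

# Hypothetical per-tower (gradient, variable) lists for one shared variable.
var = tf.Variable([1.0, 1.0])
tower_grads = [
    [(tf.constant([0.2, 0.4]), var)],  # gradients computed on GPU 0
    [(tf.constant([0.6, 0.8]), var)],  # gradients computed on GPU 1
]

# The core of average_gradients: stack each variable's per-tower gradients
# along a new leading axis and take the mean over the towers.
for grad_and_vars in zip(*tower_grads):
    grads = [g for g, _ in grad_and_vars if g is not None]
    avg_grad = tf.reduce_mean(tf.stack(grads), 0)  # [0.4, 0.6] for this variable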
github StepNeverStop / RLs / mlagents / trainers / sac / models.py
            )
        else:
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log(self.init_entcoef).astype(np.float32),
                trainable=True,
            )

        self.ent_coef = tf.exp(self.log_ent_coef)
        if discrete:
            # We also have to do a different entropy and target_entropy per branch.
            branched_log_probs = self.apply_as_branches(
                self.policy_network.all_log_probs
            )
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                    for _lp, _te in zip(branched_log_probs, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.mask)
                * tf.reduce_mean(
                    self.log_ent_coef
                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                )
            )

            # Same with policy loss, we have to do the loss per branch and average them,
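
In isolation, the branched entropy stack above boils down to: sum each branch's log-probs to shape [batch, 1], stack the branches on axis=1, and squeeze the trailing axis. A minimal sketch with two invented branches (the shapes and values are illustrative, not taken from ml-agents):

from mlagents.tf_utils import tf

# Per-action log-probs for two discrete branches, batch size 1 (made-up values).
branch_a = tf.constant([[-0.1, -0.2, -0.3]])  # shape [1, 3]
branch_b = tf.constant([[-0.4, -0.5]])        # shape [1, 2]

# Sum each branch to [1, 1], then stack the branches on axis=1 -> [1, 2, 1].
branched_sums = tf.stack(
    [tf.reduce_sum(b, axis=1, keep_dims=True) for b in (branch_a, branch_b)],
    axis=1,
)

# Squeeze the trailing axis to get one summed value per branch: shape [1, 2].
per_branch = tf.squeeze(branched_sums, axis=2)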
github StepNeverStop / RLs / mlagents / trainers / ppo / multi_gpu_policy.py
                                stream_names=list(reward_signal_configs.keys()),
                                vis_encode_type=EncoderType(
                                    trainer_params.get("vis_encode_type", "simple")
                                ),
                            )
                        )
                        self.towers[-1].create_ppo_optimizer()
            self.model = self.towers[0]
            avg_grads = self.average_gradients([t.grads for t in self.towers])
            update_batch = self.model.optimizer.apply_gradients(avg_grads)

            avg_value_loss = tf.reduce_mean(
                tf.stack([model.value_loss for model in self.towers]), 0
            )
            avg_policy_loss = tf.reduce_mean(
                tf.stack([model.policy_loss for model in self.towers]), 0
            )

        self.inference_dict.update(
            {
                "action": self.model.output,
                "log_probs": self.model.all_log_probs,
                "value_heads": self.model.value_heads,
                "value": self.model.value,
                "entropy": self.model.entropy,
                "learning_rate": self.model.learning_rate,
            }
        )
        if self.use_continuous_act:
            self.inference_dict["pre_action"] = self.model.output_pre
        if self.use_recurrent:
            self.inference_dict["memory_out"] = self.model.memory_out
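
Note that the per-tower value and policy losses are combined with exactly the same recipe as the gradients: tf.stack gathers each tower's scalar loss into one tensor, and tf.reduce_mean over axis 0 produces the single loss that the multi-GPU policy reports.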
github StepNeverStop / RLs / mlagents / trainers / sac / optimizer.py
            )
        else:
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log(self.init_entcoef).astype(np.float32),
                trainable=True,
            )

        self.ent_coef = tf.exp(self.log_ent_coef)
        if discrete:
            # We also have to do a different entropy and target_entropy per branch.
            branched_per_action_ent = ModelUtils.break_into_branches(
                per_action_entropy, self.act_size
            )
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                    for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.policy.mask)
                * tf.reduce_mean(
                    self.log_ent_coef
                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                )
            )

            # Same with policy loss, we have to do the loss per branch and average them,
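
This optimizer-based variant builds the same entropy-coefficient loss as the sac/models.py snippet above; the visible differences are that the per-branch split goes through ModelUtils.break_into_branches on per_action_entropy rather than apply_as_branches on the policy network's log-probs, and the mask is read from self.policy.mask.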
github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / ppo / models.py
                            ),
                            logits=self.all_log_probs[
                                :, action_idx[i] : action_idx[i + 1]
                            ],
                        )
                        for i in range(len(self.act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            (
                tf.stack(
                    [
                        -tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                            logits=normalized_logits[
                                :, action_idx[i] : action_idx[i + 1]
                            ],
                        )
                        for i in range(len(self.act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
            keepdims=True,
        )
        self.old_log_probs = tf.reduce_sum(
github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / sac / models.py
)

            branched_policy_loss = tf.stack(
                [
                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
                    for i, (_lp, _qt) in enumerate(
                        zip(branched_log_probs, branched_q_term)
                    )
                ]
            )
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.mask) * tf.squeeze(branched_policy_loss)
            )

            # Do vbackup entropy bonus per branch as well.
            branched_ent_bonus = tf.stack(
                [
                    tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
                    for i, _lp in enumerate(branched_log_probs)
                ]
            )
            value_losses = []
            for name in stream_names:
                v_backup = tf.stop_gradient(
                    self.min_policy_qs[name]
                    - tf.reduce_mean(branched_ent_bonus, axis=0)
                )
                value_losses.append(
                    0.5
                    * tf.reduce_mean(
                        tf.to_float(self.mask)
                        * tf.squared_difference(
github StepNeverStop / RLs / mlagents / trainers / sac / models.py
            self.target_entropy = (
                -1
                * CONTINUOUS_TARGET_ENTROPY_SCALE
                * np.prod(self.act_size[0]).astype(np.float32)
            )

        self.rewards_holders = {}
        self.min_policy_qs = {}

        for name in stream_names:
            if discrete:
                _branched_mpq1 = self.apply_as_branches(
                    self.policy_network.q1_pheads[name]
                    * self.policy_network.action_probs
                )
                branched_mpq1 = tf.stack(
                    [
                        tf.reduce_sum(_br, axis=1, keep_dims=True)
                        for _br in _branched_mpq1
                    ]
                )
                _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)

                _branched_mpq2 = self.apply_as_branches(
                    self.policy_network.q2_pheads[name]
                    * self.policy_network.action_probs
                )
                branched_mpq2 = tf.stack(
                    [
                        tf.reduce_sum(_br, axis=1, keep_dims=True)
                        for _br in _branched_mpq2
                    ]
github Unity-Technologies / ml-agents / ml-agents / mlagents / trainers / ppo / models.py
axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
        )
        _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
            self.all_old_log_probs, self.action_masks, self.act_size
        )

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (
                tf.stack(
                    [
                        tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=tf.nn.softmax(
                                self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
                            ),
                            logits=self.all_log_probs[
                                :, action_idx[i] : action_idx[i + 1]
                            ],
                        )
                        for i in range(len(self.act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
        )
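
The entropy term above follows the same list-comprehension-plus-tf.stack recipe: each branch's softmax cross-entropy is a [batch] tensor, stacking those on axis=1 gives [batch, num_branches], and tf.reduce_sum over axis=1 collapses them to one entropy value per example. A minimal sketch with invented logits for two branches:

from mlagents.tf_utils import tf

# Made-up logits for two action branches, batch size 1.
logits_by_branch = [tf.constant([[1.0, 2.0, 0.5]]), tf.constant([[0.3, 0.7]])]

# Entropy of each branch via cross-entropy of its softmax with itself: shape [1] each.
per_branch_entropy = [
    tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=tf.nn.softmax(logits), logits=logits
    )
    for logits in logits_by_branch
]

# Stack to [1, 2] (batch, branches), then sum over branches -> [1].
entropy = tf.reduce_sum(tf.stack(per_branch_entropy, axis=1), axis=1)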
github StepNeverStop / RLs / mlagents / trainers / ppo / optimizer.py
)

        # Break old log probs into separate branches
        old_log_prob_branches = ModelUtils.break_into_branches(
            self.all_old_log_probs, self.policy.act_size
        )

        _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
            old_log_prob_branches, self.policy.action_masks, self.policy.act_size
        )

        action_idx = [0] + list(np.cumsum(self.policy.act_size))

        self.old_log_probs = tf.reduce_sum(
            (
                tf.stack(
                    [
                        -tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=self.policy.selected_actions[
                                :, action_idx[i] : action_idx[i + 1]
                            ],
                            logits=old_normalized_logits[
                                :, action_idx[i] : action_idx[i + 1]
                            ],
                        )
                        for i in range(len(self.policy.act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
            keepdims=True,
github StepNeverStop / RLs / mlagents / trainers / ppo / models.py
                            labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                            logits=normalized_logits[
                                :, action_idx[i] : action_idx[i + 1]
                            ],
                        )
                        for i in range(len(self.act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
            keepdims=True,
        )
        self.old_log_probs = tf.reduce_sum(
            (
                tf.stack(
                    [
                        -tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                            logits=old_normalized_logits[
                                :, action_idx[i] : action_idx[i + 1]
                            ],
                        )
                        for i in range(len(self.act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
            keepdims=True,
        )
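
Across all of these snippets tf.stack plays the same role: it converts a Python list of per-branch or per-tower tensors, usually built by a list comprehension, into a single tensor with one extra axis so that tf.reduce_mean or tf.reduce_sum can average or total across branches, towers, or action dimensions in a single graph op.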