How to use the trfl.policy_gradient_ops module in trfl

To help you get started, we’ve selected a few trfl examples based on popular ways it is used in public projects. All of the snippets below come from the fomorians-oss/pyoneer agents, and each one uses policy_gradient_ops.policy_entropy_loss to add an entropy bonus to a policy-gradient loss.
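Before the examples, here is a minimal, self-contained sketch of the two calls from this module that the snippets below build on. It assumes trfl and tensorflow_probability are installed, and the (policies, actions, action_values) signature of policy_gradient reflects our reading of the trfl API, so treat it as a sketch rather than a definitive reference.

import tensorflow as tf
import tensorflow_probability as tfp
from trfl import policy_gradient_ops

# A toy batch of Gaussian policies over a 1-D action, with sampled actions
# and advantage estimates.
policy = tfp.distributions.Normal(loc=[0.0, 0.1], scale=[1.0, 1.0])
actions = tf.constant([0.2, -0.3])
advantages = tf.constant([1.5, -0.7])

# Policy-gradient surrogate: -log_prob(actions) * stop_gradient(advantages).
pg_loss = policy_gradient_ops.policy_gradient(policy, actions, advantages)

# Entropy bonus; the final argument scales the raw entropy, mirroring the
# lambda policies: entropy_scale pattern used in every snippet below.
entropy_scale = 0.01
entropy_loss = policy_gradient_ops.policy_entropy_loss(
    policy, None, lambda policies: entropy_scale).loss

total_loss = tf.reduce_mean(pg_loss + entropy_loss)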


From github.com/fomorians-oss/pyoneer: pyoneer/rl/agents/vanilla_policy_gradient_agent_impl.py
        # Discounted returns for the rollout, used both to fit the value baseline
        # and to form the advantage estimate below.
        returns = _discounted_returns(rewards, decay, weights)
        self.value.fit(states, returns)

        action_values = (returns - array_ops.squeeze(self.value(states, training=True), axis=-1))
        action_values *= weights
        if normalize_action_values:
            action_values = normalization_ops.weighted_moments_normalize(action_values, weights)

        policy = self.policy(states, training=True)
        log_prob = policy.log_prob(actions)
        policy_gradient_loss = gen_array_ops.stop_gradient(action_values) * -log_prob
        self.policy_gradient_loss = losses_impl.compute_weighted_loss(
            policy_gradient_loss,
            weights=weights)

        # Entropy bonus from trfl; the lambda scales the raw policy entropy by
        # entropy_scale, and .loss extracts the loss term from the returned tuple.
        entropy_loss = policy_gradient_ops.policy_entropy_loss(
            policy,
            self.policy.trainable_variables,
            lambda policies: entropy_scale).loss
        self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
            entropy_loss,
            weights=weights)

        self.total_loss = math_ops.add_n([
            self.policy_gradient_loss, 
            self.policy_gradient_entropy_loss])

        return self.total_loss
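The snippet keeps only the .loss field of what policy_entropy_loss returns. To our reading of the trfl API, the function returns a LossOutput namedtuple whose extra field also exposes the unscaled entropy, which is handy for summaries. The continuation below reuses the names from the snippet above and is a sketch, not pyoneer code.

        # Keep the whole LossOutput so the unscaled entropy can be logged
        # alongside the scaled entropy loss.
        entropy_output = policy_gradient_ops.policy_entropy_loss(
            policy,
            self.policy.trainable_variables,
            lambda policies: entropy_scale)
        entropy_loss = entropy_output.loss            # scaled by entropy_scale
        raw_entropy = entropy_output.extra.entropy    # unscaled, e.g. for logging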
From github.com/fomorians-oss/pyoneer: pyoneer/rl/agents/v_trace_proximal_policy_optimization_agent_impl.py
            parray_ops.swap_time_major(rollouts.rewards),
            baseline_values,
            bootstrap_values)

        advantages = parray_ops.swap_time_major(vtrace_returns.pg_advantages)
        advantages = normalization_ops.weighted_moments_normalize(advantages, rollouts.weights)
        advantages = gen_array_ops.stop_gradient(advantages)

        # Importance ratio between the target and behaviour policies, clipped to
        # [1 - epsilon, 1 + epsilon] for the PPO-style surrogate objective.
        ratio = parray_ops.swap_time_major(gen_math_ops.exp(log_rhos))
        clipped_ratio = clip_ops.clip_by_value(ratio, 1. - ratio_epsilon, 1. + ratio_epsilon)

        self.policy_gradient_loss = -losses_impl.compute_weighted_loss(
            gen_math_ops.minimum(advantages * ratio, advantages * clipped_ratio), 
            weights=rollouts.weights)

        entropy_loss = policy_gradient_ops.policy_entropy_loss(
            policy, 
            self.policy.trainable_variables,
            lambda policies: entropy_scale).loss
        entropy_loss = parray_ops.expand_to(entropy_loss, ndims=3)
        entropy_loss = math_ops.reduce_sum(entropy_loss, axis=-1)
        self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
            entropy_loss,
            weights=rollouts.weights)

        self.value_loss = math_ops.reduce_mean(
            math_ops.multiply(
                math_ops.reduce_sum(math_ops.square(vtrace_returns.vs - baseline_values), axis=0), 
                baseline_scale * .5),
            axis=0)

        self.total_loss = math_ops.add_n([
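The log_rhos and vtrace_returns consumed in this snippet are produced outside the excerpt. Below is a hedged sketch of where they might come from, using trfl's own V-trace op, whose returned namedtuple carries the vs and pg_advantages fields used above; the exact argument names and the discounts tensor are our assumptions, not code from pyoneer.

from trfl import vtrace_ops

# Log importance ratios between the target policy and the behaviour policy
# that generated the rollout; V-trace expects time-major [T, B] inputs.
log_rhos = policy.log_prob(actions) - behavioral_policy.log_prob(actions)

vtrace_returns = vtrace_ops.vtrace_from_importance_weights(
    log_rhos=log_rhos,
    discounts=discounts,          # assumed per-step discounts, e.g. decay * pcontinues
    rewards=rewards,
    values=baseline_values,
    bootstrap_value=bootstrap_values)
# vtrace_returns.vs are the value targets and vtrace_returns.pg_advantages feed
# the policy-gradient term, exactly as in the snippet above.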
From github.com/fomorians-oss/pyoneer: pyoneer/rl/agents/proximal_policy_optimization_agent_impl.py
            parray_ops.swap_time_major(lambda_))

        advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
        advantages = normalization_ops.normalize_by_moments(advantages, weights)
        advantages = gen_array_ops.stop_gradient(advantages)

        # PPO importance ratio exp(log pi(a|s) - log mu(a|s)); the behaviour
        # policy's log-probability is treated as a constant via stop_gradient.
        ratio = gen_math_ops.exp(
            policy.log_prob(actions) - gen_array_ops.stop_gradient(
                behavioral_policy.log_prob(actions)))
        clipped_ratio = clip_ops.clip_by_value(ratio, 1. - ratio_epsilon, 1. + ratio_epsilon)

        self.policy_gradient_loss = -losses_impl.compute_weighted_loss(
            gen_math_ops.minimum(advantages * ratio, advantages * clipped_ratio), 
            weights=weights)

        entropy_loss = policy_gradient_ops.policy_entropy_loss(
            policy, 
            self.policy.trainable_variables,
            lambda policies: entropy_scale).loss
        self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
            entropy_loss,
            weights=weights)

        self.value_loss = pmath_ops.safe_divide(
            baseline_scale * math_ops.reduce_sum(baseline_loss), total_num)
        self.value_loss = gen_array_ops.check_numerics(
            self.value_loss, 'value_loss')

        self.total_loss = math_ops.add_n([
            self.value_loss,
            self.policy_gradient_loss, 
            self.policy_gradient_entropy_loss])
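The last two snippets share the same clipped-ratio construction, which is PPO's clipped surrogate objective. Here is a minimal, standalone illustration with made-up numbers and plain TensorFlow ops; every name is illustrative.

import tensorflow as tf

advantages = tf.constant([1.0, -0.5, 2.0])
ratio = tf.constant([1.3, 0.7, 1.05])   # pi(a|s) / mu(a|s) for three transitions
ratio_epsilon = 0.2

clipped_ratio = tf.clip_by_value(ratio, 1. - ratio_epsilon, 1. + ratio_epsilon)
# Maximising the clipped objective is the same as minimising its negation,
# which is how the snippets above define policy_gradient_loss.
ppo_loss = -tf.reduce_mean(tf.minimum(advantages * ratio, advantages * clipped_ratio))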
From github.com/fomorians-oss/pyoneer: pyoneer/rl/agents/v_trace_advantage_actor_critic_agent_impl.py
            pcontinues,
            parray_ops.swap_time_major(rewards),
            baseline_values,
            bootstrap_values)

        advantages = parray_ops.swap_time_major(vtrace_returns.pg_advantages)
        if normalize_advantages:
            advantages = normalization_ops.normalize_by_moments(advantages, weights)
        advantages = gen_array_ops.stop_gradient(advantages)

        policy_gradient_loss = advantages * -log_prob
        self.policy_gradient_loss = losses_impl.compute_weighted_loss(
            policy_gradient_loss,
            weights=weights)

        entropy_loss = policy_gradient_ops.policy_entropy_loss(
            policy, 
            self.policy.trainable_variables,
            lambda policies: entropy_scale).loss
        self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
            entropy_loss,
            weights=weights)

        baseline_loss = math_ops.reduce_sum(
            math_ops.square(vtrace_returns.vs - baseline_values), axis=0)
        self.value_loss = pmath_ops.safe_divide(
            .5 * baseline_scale * math_ops.reduce_sum(baseline_loss), total_num)
        self.value_loss = gen_array_ops.check_numerics(
            self.value_loss, 'value_loss')

        self.total_loss = math_ops.add_n([
            self.value_loss,
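The total_num used in the value loss above is defined outside this excerpt. As an assumption, a plausible definition is the number of valid (non-padding) steps, so that safe_divide turns the summed baseline loss into a per-step mean; this is our guess, not pyoneer code.

        # Assumption: count of valid (non-padding) steps in the batch of rollouts.
        total_num = math_ops.reduce_sum(weights)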
From github.com/fomorians-oss/pyoneer: pyoneer/rl/agents/advantage_actor_critic_agent_impl.py
        advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
        if normalize_advantages:
            advantages = normalization_ops.normalize_by_moments(advantages, weights)
        advantages = gen_array_ops.check_numerics(advantages, 'advantages')

        policy = self.policy(states, training=True)
        log_prob = policy.log_prob(actions)
        policy_gradient_loss = gen_array_ops.stop_gradient(advantages) * -log_prob
        self.policy_gradient_loss = losses_impl.compute_weighted_loss(
            policy_gradient_loss,
            weights=weights)
        self.policy_gradient_loss = gen_array_ops.check_numerics(
            self.policy_gradient_loss, 'policy_gradient_loss')

        entropy_loss = policy_gradient_ops.policy_entropy_loss(
            policy, 
            self.policy.trainable_variables,
            lambda policies: entropy_scale).loss
        self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
            entropy_loss,
            weights=weights)
        self.policy_gradient_entropy_loss = gen_array_ops.check_numerics(
            self.policy_gradient_entropy_loss, 'policy_gradient_entropy_loss')

        self.value_loss = pmath_ops.safe_divide(
            baseline_scale * math_ops.reduce_sum(baseline_loss), total_num)
        self.value_loss = gen_array_ops.check_numerics(
            self.value_loss, 'value_loss')

        self.total_loss = math_ops.add_n([
            self.value_loss,
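Every agent above ends by summing its value, policy-gradient and entropy losses with math_ops.add_n. As a closing sketch, here is how such a combined loss would typically be minimised in eager TensorFlow; agent, compute_loss and trainable_variables are hypothetical names, not pyoneer's actual API.

import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(1e-3)

def train_step(agent, rollouts):
    # Hypothetical: compute_loss builds total_loss as in the snippets above.
    with tf.GradientTape() as tape:
        loss = agent.compute_loss(rollouts)
    grads = tape.gradient(loss, agent.trainable_variables)
    optimizer.apply_gradients(zip(grads, agent.trainable_variables))
    return loss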