# Monte-Carlo returns with a fitted value baseline for variance reduction.
returns = _discounted_returns(rewards, decay, weights)
self.value.fit(states, returns)
action_values = (returns - array_ops.squeeze(self.value(states, training=True), axis=-1))
action_values *= weights
if normalize_action_values:
    action_values = normalization_ops.weighted_moments_normalize(action_values, weights)

# REINFORCE-style surrogate: -log pi(a|s) scaled by the stopped-gradient advantages.
policy = self.policy(states, training=True)
log_prob = policy.log_prob(actions)
policy_gradient_loss = gen_array_ops.stop_gradient(action_values) * -log_prob
self.policy_gradient_loss = losses_impl.compute_weighted_loss(
    policy_gradient_loss,
    weights=weights)

# Entropy regularization term, scaled by `entropy_scale`.
entropy_loss = policy_gradient_ops.policy_entropy_loss(
    policy,
    self.policy.trainable_variables,
    lambda policies: entropy_scale).loss
self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
    entropy_loss,
    weights=weights)

self.total_loss = math_ops.add_n([
    self.policy_gradient_loss,
    self.policy_gradient_entropy_loss])
return self.total_loss
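# `_discounted_returns` is a private helper of this codebase. A minimal NumPy
# sketch of the weighted, discounted reward-to-go computation it appears to
# perform (assumed semantics, not the library's actual implementation):
import numpy as np

def discounted_returns(rewards, decay, weights):
    """R_t = sum_k decay**k * r_{t+k}, with padded steps masked out by `weights`."""
    rewards = np.asarray(rewards, dtype=np.float64) * np.asarray(weights, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = np.zeros(rewards.shape[:-1])
    for t in reversed(range(rewards.shape[-1])):
        running = rewards[..., t] + decay * running
        returns[..., t] = running
    return returns

# Example: one episode of three steps, decay 0.9, no padding.
print(discounted_returns([1., 0., 1.], 0.9, [1., 1., 1.]))  # [1.81 0.9  1.  ]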
# Final arguments of the V-trace call (its opening is not shown in this snippet)
# that produces the `vtrace_returns` used below.
    parray_ops.swap_time_major(rollouts.rewards),
    baseline_values,
    bootstrap_values)
# Normalize the V-trace policy-gradient advantages and block gradients through them.
advantages = parray_ops.swap_time_major(vtrace_returns.pg_advantages)
advantages = normalization_ops.weighted_moments_normalize(advantages, rollouts.weights)
advantages = gen_array_ops.stop_gradient(advantages)

# PPO-style clipped surrogate on the importance-sampling ratio exp(log_rhos).
ratio = parray_ops.swap_time_major(gen_math_ops.exp(log_rhos))
clipped_ratio = clip_ops.clip_by_value(ratio, 1. - ratio_epsilon, 1. + ratio_epsilon)
self.policy_gradient_loss = -losses_impl.compute_weighted_loss(
    gen_math_ops.minimum(advantages * ratio, advantages * clipped_ratio),
    weights=rollouts.weights)

# Entropy regularization term, scaled by `entropy_scale`.
entropy_loss = policy_gradient_ops.policy_entropy_loss(
    policy,
    self.policy.trainable_variables,
    lambda policies: entropy_scale).loss
entropy_loss = parray_ops.expand_to(entropy_loss, ndims=3)
entropy_loss = math_ops.reduce_sum(entropy_loss, axis=-1)
self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
    entropy_loss,
    weights=rollouts.weights)

# Baseline (value) loss: squared error against the V-trace value targets,
# scaled by 0.5 * baseline_scale.
self.value_loss = math_ops.reduce_mean(
    math_ops.multiply(
        math_ops.reduce_sum(math_ops.square(vtrace_returns.vs - baseline_values), axis=0),
        baseline_scale * .5),
    axis=0)
self.total_loss = math_ops.add_n([
    self.value_loss,
    self.policy_gradient_loss,
    self.policy_gradient_entropy_loss])
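# `normalization_ops.weighted_moments_normalize` (also used above) standardizes a
# tensor by its weighted mean and variance. A rough NumPy equivalent under that
# assumption (the epsilon and exact broadcasting are illustrative, not the
# library's exact behavior):
import numpy as np

def weighted_moments_normalize(values, weights, epsilon=1e-8):
    values = np.asarray(values, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)
    total = max(weights.sum(), epsilon)
    mean = (values * weights).sum() / total
    variance = (weights * (values - mean) ** 2).sum() / total
    return (values - mean) / np.sqrt(variance + epsilon)

# Zero-weight (padded) entries do not contribute to the statistics.
print(weighted_moments_normalize([1., 2., 3., 99.], [1., 1., 1., 0.]))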
# Final argument of the TD(lambda) call (its opening is not shown in this snippet)
# that produces the `td_lambda` result used below.
    parray_ops.swap_time_major(lambda_))
# Normalize the TD(lambda) temporal-difference advantages and block gradients through them.
advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
advantages = normalization_ops.normalize_by_moments(advantages, weights)
advantages = gen_array_ops.stop_gradient(advantages)

# PPO-style clipped surrogate on the current-vs-behavioral policy ratio.
ratio = gen_math_ops.exp(
    policy.log_prob(actions) - gen_array_ops.stop_gradient(
        behavioral_policy.log_prob(actions)))
clipped_ratio = clip_ops.clip_by_value(ratio, 1. - ratio_epsilon, 1. + ratio_epsilon)
self.policy_gradient_loss = -losses_impl.compute_weighted_loss(
    gen_math_ops.minimum(advantages * ratio, advantages * clipped_ratio),
    weights=weights)

# Entropy regularization term, scaled by `entropy_scale`.
entropy_loss = policy_gradient_ops.policy_entropy_loss(
    policy,
    self.policy.trainable_variables,
    lambda policies: entropy_scale).loss
self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
    entropy_loss,
    weights=weights)

# Baseline (value) loss, scaled by `baseline_scale` and averaged over `total_num` samples.
self.value_loss = pmath_ops.safe_divide(
    baseline_scale * math_ops.reduce_sum(baseline_loss), total_num)
self.value_loss = gen_array_ops.check_numerics(
    self.value_loss, 'value_loss')

self.total_loss = math_ops.add_n([
    self.value_loss,
    self.policy_gradient_loss,
    self.policy_gradient_entropy_loss])
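# Both snippets above build the PPO-style clipped surrogate: take the elementwise
# minimum of ratio * advantage and clip(ratio, 1 - eps, 1 + eps) * advantage and
# negate its weighted mean. A standalone sketch with the public TensorFlow API
# (argument names mirror the variables above; an illustration, not the
# repository's implementation):
import tensorflow as tf

def clipped_surrogate_loss(log_prob, behavioral_log_prob, advantages, weights,
                           ratio_epsilon=0.2):
    ratio = tf.exp(log_prob - tf.stop_gradient(behavioral_log_prob))
    clipped_ratio = tf.clip_by_value(ratio, 1. - ratio_epsilon, 1. + ratio_epsilon)
    advantages = tf.stop_gradient(advantages)
    surrogate = tf.minimum(advantages * ratio, advantages * clipped_ratio)
    # Weighted mean over valid (unpadded) steps, negated so it can be minimized.
    return -tf.reduce_sum(surrogate * weights) / tf.maximum(tf.reduce_sum(weights), 1.)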
# Final arguments of the V-trace call (its opening is not shown in this snippet)
# that produces the `vtrace_returns` used below.
    pcontinues,
    parray_ops.swap_time_major(rewards),
    baseline_values,
    bootstrap_values)
# Optionally normalize the V-trace advantages, then block gradients through them.
advantages = parray_ops.swap_time_major(vtrace_returns.pg_advantages)
if normalize_advantages:
    advantages = normalization_ops.normalize_by_moments(advantages, weights)
advantages = gen_array_ops.stop_gradient(advantages)

# Advantage actor-critic surrogate: -log pi(a|s) scaled by the advantages.
policy_gradient_loss = advantages * -log_prob
self.policy_gradient_loss = losses_impl.compute_weighted_loss(
    policy_gradient_loss,
    weights=weights)

# Entropy regularization term, scaled by `entropy_scale`.
entropy_loss = policy_gradient_ops.policy_entropy_loss(
    policy,
    self.policy.trainable_variables,
    lambda policies: entropy_scale).loss
self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
    entropy_loss,
    weights=weights)

# Baseline (value) loss: squared error against the V-trace value targets,
# scaled by 0.5 * baseline_scale and averaged over `total_num` samples.
baseline_loss = math_ops.reduce_sum(
    math_ops.square(vtrace_returns.vs - baseline_values), axis=0)
self.value_loss = pmath_ops.safe_divide(
    .5 * baseline_scale * math_ops.reduce_sum(baseline_loss), total_num)
self.value_loss = gen_array_ops.check_numerics(
    self.value_loss, 'value_loss')
self.total_loss = math_ops.add_n([
    self.value_loss,
    self.policy_gradient_loss,
    self.policy_gradient_entropy_loss])
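# `pmath_ops.safe_divide` is assumed here to guard the division by `total_num`
# against a zero denominator (e.g. an all-padding batch) instead of producing
# inf/NaN. One common way to write such a guard with the public TensorFlow API
# (an assumption about its semantics, not the library's code):
import tensorflow as tf

def safe_divide(numerator, denominator):
    numerator = tf.convert_to_tensor(numerator, dtype=tf.float32)
    denominator = tf.convert_to_tensor(denominator, dtype=tf.float32)
    safe_denominator = tf.where(
        tf.equal(denominator, 0.), tf.ones_like(denominator), denominator)
    return tf.where(
        tf.equal(denominator, 0.), tf.zeros_like(numerator), numerator / safe_denominator)

print(safe_divide(6., 3.))  # -> 2.0
print(safe_divide(6., 0.))  # -> 0.0 rather than inf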
# Optionally normalize the TD(lambda) advantages and check them for inf/NaN.
advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
if normalize_advantages:
    advantages = normalization_ops.normalize_by_moments(advantages, weights)
advantages = gen_array_ops.check_numerics(advantages, 'advantages')

# Advantage actor-critic surrogate: -log pi(a|s) scaled by the stopped-gradient advantages.
policy = self.policy(states, training=True)
log_prob = policy.log_prob(actions)
policy_gradient_loss = gen_array_ops.stop_gradient(advantages) * -log_prob
self.policy_gradient_loss = losses_impl.compute_weighted_loss(
    policy_gradient_loss,
    weights=weights)
self.policy_gradient_loss = gen_array_ops.check_numerics(
    self.policy_gradient_loss, 'policy_gradient_loss')

# Entropy regularization term, scaled by `entropy_scale`.
entropy_loss = policy_gradient_ops.policy_entropy_loss(
    policy,
    self.policy.trainable_variables,
    lambda policies: entropy_scale).loss
self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
    entropy_loss,
    weights=weights)
self.policy_gradient_entropy_loss = gen_array_ops.check_numerics(
    self.policy_gradient_entropy_loss, 'policy_gradient_entropy_loss')

# Baseline (value) loss, scaled by `baseline_scale` and averaged over `total_num` samples.
self.value_loss = pmath_ops.safe_divide(
    baseline_scale * math_ops.reduce_sum(baseline_loss), total_num)
self.value_loss = gen_array_ops.check_numerics(
    self.value_loss, 'value_loss')
self.total_loss = math_ops.add_n([
    self.value_loss,