def _discounted_returns(rewards, decay, weights):
    """Compute the discounted returns given the decay factor."""
    # `weights` masks out padded timesteps; the bootstrap value is the reward
    # at the last valid step of each sequence.
    sequence_lengths = math_ops.reduce_sum(weights, axis=1)
    bootstrap_values = indexing_ops.batched_index(
        rewards, math_ops.cast(sequence_lengths - 1, dtypes.int32))
    # Scan backwards over the masked rewards, discounting by `decay` at each
    # step; inputs are swapped to time-major order as required by the scan.
    multi_step_returns = sequence_ops.scan_discounted_sum(
        parray_ops.swap_time_major(rewards * weights),
        parray_ops.swap_time_major(decay * weights),
        bootstrap_values,
        reverse=True,
        back_prop=False)
    return parray_ops.swap_time_major(multi_step_returns)
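# A minimal NumPy sketch (not part of the original source) of what the
# time-major backward scan above computes: out[t] = x[t] + decay[t] * out[t+1],
# seeded with the bootstrap value. The helper name `reference_discounted_sum`
# is hypothetical and only for illustration.
import numpy as np

def reference_discounted_sum(sequence, decay, initial_value):
    """Backward discounted sum over time-major [T, B] arrays."""
    out = np.zeros_like(sequence)
    acc = initial_value
    for t in reversed(range(sequence.shape[0])):
        acc = sequence[t] + decay[t] * acc
        out[t] = acc
    return out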
# exp_q_t = 𝔼_π Q(x_{t+1}, .)
# qa_t = Q(x_t, a_t)
# Hence:
#   T_tm1 = r_t + γ * (exp_q_t - c_t * qa_t) + γ * c_t * T_t
# Define:
#   current = r_t + γ * (exp_q_t - c_t * qa_t)
# Thus:
#   T_tm1 = scan_discounted_sum(current, γ * c_t, reverse=True)
args = [r_t, pcont_t, target_policy_t, c_t, q_t, a_t]
with tf.name_scope(
    name, 'general_returns_based_off_policy_target', values=args):
  # Expected Q-value under the target policy, and the Q-value of the action
  # actually taken.
  exp_q_t = tf.reduce_sum(target_policy_t * q_t, axis=2)
  qa_t = indexing_ops.batched_index(q_t, a_t)
  current = r_t + pcont_t * (exp_q_t - c_t * qa_t)
  initial_value = qa_t[-1]
  return sequence_ops.scan_discounted_sum(
      current,
      pcont_t * c_t,
      initial_value,
      reverse=True,
      back_prop=back_prop)
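# A hedged NumPy illustration (not from the original source) of the recursion in
# the comments above, on time-major [T, B] arrays: it forms
# current[t] = r[t] + pcont[t] * (exp_q[t] - c[t] * qa[t]) and then scans
# backwards with decay pcont[t] * c[t], seeded with qa[-1], mirroring the
# scan_discounted_sum call. The helper name is hypothetical.
import numpy as np

def reference_off_policy_target(r, pcont, c, exp_q, qa):
    current = r + pcont * (exp_q - c * qa)
    targets = np.zeros_like(r)
    acc = qa[-1]
    for t in reversed(range(r.shape[0])):
        acc = current[t] + pcont[t] * c[t] * acc
        targets[t] = acc
    return targets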
  lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
  name: Customises the name_scope for this op.

Returns:
  2-D Tensor with shape `[T, B]`.
"""
values.get_shape().assert_has_rank(2)
rewards.get_shape().assert_has_rank(2)
pcontinues.get_shape().assert_has_rank(2)
bootstrap_value.get_shape().assert_has_rank(1)
scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]
with tf.name_scope(name, values=scoped_values):
  if lambda_ == 1:
    # This is actually equivalent to the branch below, just an optimisation
    # to avoid unnecessary work in this case:
    return sequence_ops.scan_discounted_sum(
        rewards,
        pcontinues,
        initial_value=bootstrap_value,
        reverse=True,
        back_prop=False,
        name="multistep_returns")
  else:
    v_tp1 = tf.concat(
        axis=0, values=[values[1:, :],
                        tf.expand_dims(bootstrap_value, 0)])
    # `back_prop=False` prevents gradients flowing into values and
    # bootstrap_value, which is what you want when using the bootstrapped
    # lambda-returns in an update as targets for values.
    return sequence_ops.multistep_forward_view(
        rewards,
        pcontinues,
        v_tp1,
        lambda_,
        back_prop=False)
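# Usage sketch (an assumption, not from the original source): the fragment above
# appears to match trfl's generalized_lambda_returns, so a toy call with T=3
# timesteps and B=2 batch entries might look roughly like this. All inputs are
# time-major.
import tensorflow as tf
import trfl

rewards = tf.constant([[1., 0.], [0., 1.], [1., 1.]])  # [T, B]
pcontinues = tf.constant(0.99, shape=[3, 2])           # per-step discounts
values = tf.zeros([3, 2])                               # V(x_t) estimates
bootstrap_value = tf.zeros([2])                         # V(x_T)
lambda_returns = trfl.generalized_lambda_returns(
    rewards, pcontinues, values, bootstrap_value, lambda_=0.9)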
def discounted_returns(self, decay):
    """Compute the discounted returns given the decay factor."""
    decay = ops.convert_to_tensor(decay)
    # Work in time-major order and broadcast the decay to the full [T, B]
    # shape expected by the scan.
    sequence = parray_ops.swap_time_major(self.rewards)
    decay = gen_array_ops.broadcast_to(decay, array_ops.shape(sequence))
    multi_step_returns = sequence_ops.scan_discounted_sum(
        sequence,
        decay,
        array_ops.zeros_like(sequence[0]),
        reverse=True,
        back_prop=False)
    return parray_ops.swap_time_major(multi_step_returns)
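# Worked example (illustrative only; the class owning the method above is not
# shown in this snippet): with batch-major rewards [[1., 1., 1.]] and decay 0.9,
# the backward recursion R_t = r_t + 0.9 * R_{t+1} with R_T = 0 gives
# [[1 + 0.9 * 1.9, 1 + 0.9 * 1.0, 1.0]] = [[2.71, 1.9, 1.0]], which is what
# discounted_returns should return after swapping back to batch-major order.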