"""
flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
with tf.name_scope(name, values=flat_policy_vars):
# We want a value that we can minimize along with other losses, and where
# minimizing means driving the policy towards a uniform distribution over
# the actions. We thus scale it by negative one so that it can be simply
# added to other losses.
scale = tf.constant(-1.0, dtype=tf.float32)
if scale_op:
scale *= scale_op(policies)
policies = nest.flatten(policies)
entropy = tf.add_n(
[policy.entropy() for policy in policies], name="entropy")
loss = tf.multiply(scale, entropy, name="entropy_loss")
return base_ops.LossOutput(loss, PolicyEntropyExtra(entropy))
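# Illustrative sketch only (not taken from the snippet above): the same
# negative-entropy construction for a single categorical policy, assuming
# TensorFlow Probability is available. All values below are dummy examples.
import tensorflow as tf
import tensorflow_probability as tfp

action_logits = tf.constant([[2.0, 0.5, -1.0], [0.1, 0.1, 0.1]])  # [B=2, A=3].
example_policy = tfp.distributions.Categorical(logits=action_logits)
example_entropy = example_policy.entropy()        # [B]; largest for a uniform policy.
example_entropy_loss = -1.0 * example_entropy     # negated so minimizing it spreads probability mass.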
  # Targets are evaluated by using only Q values from the target network.
  # This provides fixed regression targets until the next target network
  # update.
  target = _general_off_policy_corrected_multistep_target(
      r_t, pcont_t, target_policy_t, c_t, targnet_q_t, a_t,
      not stop_targnet_gradients)

  if stop_targnet_gradients:
    target = tf.stop_gradient(target)

  # Regress Q values of the learning network towards the targets evaluated
  # by using the target network.
  qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
  delta = target - qa_tm1
  loss = 0.5 * tf.square(delta)

  return base_ops.LossOutput(
      loss, RetraceCoreExtra(retrace_weights=c_t, target=target))
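# Illustrative sketch only: regressing a learning-network value towards a
# fixed target, with tf.stop_gradient keeping gradients out of the target
# (mirroring the fixed-target regression above). Dummy values, not from the
# snippet.
import tensorflow as tf

qa_learning = tf.constant([1.2, -0.3, 0.8])        # Q(s, a) from the learning network, [B].
fixed_target = tf.stop_gradient(
    tf.constant([1.0, 0.0, 1.5]))                  # e.g. a multistep corrected target, [B].
example_delta = fixed_target - qa_learning
example_loss = 0.5 * tf.square(example_delta)      # [B], same half squared error as above.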
      to crop the input observations before computing the pseudo-rewards.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape [B].
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B].
        * `td_error`: batch of temporal difference errors, shape [B].

  Raises:
    ValueError: if the shape of `action_values` is not compatible with that of
      the pseudo-rewards derived from the observations.
  """
  # Useful shapes.
  sequence_length, batch_size = base_ops.best_effort_shape(actions)
  num_actions = action_values.get_shape().as_list()[-1]
  height_width_q = action_values.get_shape().as_list()[2:-1]
  # Calculate rewards using the observations. Crop observations if appropriate.
  if crop_height_dim[0] is not None:
    h_low, h_high = crop_height_dim
    observations = observations[:, :, h_low:h_high, :]
  if crop_width_dim[0] is not None:
    w_low, w_high = crop_width_dim
    observations = observations[:, :, :, w_low:w_high]
  # Rescale observations by a constant factor.
  observations *= tf.constant(scale)
  # Compute pseudo-rewards and get their shape.
  pseudo_rewards = pixel_control_rewards(observations, cell_size)
  height_width = pseudo_rewards.get_shape().as_list()[2:]
  # Check that pseudo-rewards and Q-values are compatible in shape.
  if height_width != height_width_q:
    raise ValueError(
        "The shape of `action_values` is not compatible with that of the "
        "pseudo-rewards derived from the observations: got {} vs {}.".format(
            height_width_q, height_width))
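# Illustrative sketch only: one common definition of pixel-control
# pseudo-rewards (UNREAL-style) is the absolute pixel change between
# consecutive observations, averaged over channels and over each spatial cell.
# Shapes, names and the cell size below are assumptions for the example.
import tensorflow as tf

frames = tf.zeros([5, 2, 84, 84, 3])                  # dummy [T+1, B, H, W, C] observations.
abs_change = tf.reduce_mean(
    tf.abs(frames[1:] - frames[:-1]), axis=-1)        # [T, B, H, W], mean over channels.
cell = 4                                              # assumed cell_size.
pooled = tf.nn.avg_pool(
    tf.expand_dims(tf.reshape(abs_change, [-1, 84, 84]), -1),
    ksize=[1, cell, cell, 1], strides=[1, cell, cell, 1],
    padding="VALID")                                  # [T*B, H/cell, W/cell, 1].
example_pseudo_rewards = tf.reshape(
    pooled, [4, 2, 84 // cell, 84 // cell])           # [T, B, H', W'].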
  # Convert logits to distribution, then find greedy policy action in
  # state s_t.
  q_t_probs = tf.nn.softmax(logits_q_t)
  pi_t = tf.argmax(q_t_selector, 1, output_type=tf.int32)
  # Compute distribution for greedy action.
  p_target_z = _slice_with_actions(q_t_probs, pi_t)

  # Project using the Cramer distance.
  target = tf.stop_gradient(_l2_project(target_z, p_target_z, atoms_tm1))
  logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1)
  loss = tf.nn.softmax_cross_entropy_with_logits(
      logits=logit_qa_tm1, labels=target)

  return base_ops.LossOutput(loss, Extra(target))
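# Illustrative sketch only: the cross-entropy step above shown with dummy
# distributions over return atoms. The projected target is held fixed with
# tf.stop_gradient and the predicted logits are trained towards it.
import tensorflow as tf

predicted_logits = tf.constant([[0.2, 0.5, 0.3], [1.0, -1.0, 0.0]])   # [B=2, num_atoms=3].
target_probs = tf.stop_gradient(
    tf.constant([[0.1, 0.7, 0.2], [0.6, 0.3, 0.1]]))                  # projected target, [B, num_atoms].
example_loss = tf.nn.softmax_cross_entropy_with_logits(
    labels=target_probs, logits=predicted_logits)                     # [B].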
        tf.reduce_sum(entropy_loss_op, axis=0),
        name="scaled_entropy_loss")  # [B].
    total_loss = tf.add(total_loss, entropy_loss,
                        name="total_loss_with_entropy")
  else:
    entropy = None
    entropy_loss = None

  extra = SequenceAdvantageActorCriticExtra(
      entropy=entropy, entropy_loss=entropy_loss,
      baseline_loss=baseline_loss,
      policy_gradient_loss=policy_gradient_loss,
      advantages=advantages,
      discounted_returns=td_lambda.discounted_returns)

  return base_ops.LossOutput(total_loss, extra)
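# Illustrative sketch only: how a scaled entropy term is typically folded into
# an actor-critic objective, using dummy per-batch losses. The entropy
# coefficient value is an assumption for the example.
import tensorflow as tf

pg_loss = tf.constant([0.8, 1.1])                  # [B], dummy policy gradient loss.
value_loss = tf.constant([0.3, 0.2])               # [B], dummy baseline loss.
entropy_term = tf.constant([2.1, 1.9])             # [B], summed policy entropy.
entropy_cost = 0.01                                # assumed coefficient.
example_total_loss = (pg_loss + value_loss
                      - entropy_cost * entropy_term)   # subtracting entropy == adding an entropy loss.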
        [1, 1] + height_width)
    pcont_t = tf.reshape(tiled_pcont, [sequence_length, -1])
  else:
    raise ValueError(
        "The discount_factor must be a scalar or a tensor of rank 2; "
        "instead it is a tensor of shape {}".format(
            discount_factor.shape.as_list()))
  # Compute a QLambda loss of shape [T,BHW].
  loss, _ = action_value_ops.qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=1)
  # Take sum over sequence, sum over cells.
  expanded_shape = [sequence_length, batch_size] + height_width
  spatial_loss = tf.reshape(loss, expanded_shape)  # [T,B,H,W].

  # Return.
  extra = PixelControlExtra(
      spatial_loss=spatial_loss, pseudo_rewards=pseudo_rewards)
  return base_ops.LossOutput(
      tf.reduce_sum(spatial_loss, axis=[0, 2, 3]), extra)  # [B]
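# Illustrative sketch only: collapsing a [T, B, H, W] spatial loss to a
# per-batch-element loss and then to a scalar suitable for an optimizer,
# mirroring the reduction above. The shapes below are dummy assumptions.
import tensorflow as tf

example_spatial_loss = tf.zeros([4, 2, 21, 21])                        # [T, B, H', W'].
per_element_loss = tf.reduce_sum(example_spatial_loss, axis=[0, 2, 3])  # [B].
scalar_loss = tf.reduce_mean(per_element_loss)                          # scalar to minimize.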