def tf_step(self, time, variables, global_variables, **kwargs):
    # Local and global variables must match in shape.
    assert all(
        util.shape(global_var) == util.shape(local_var)
        for global_var, local_var in zip(global_variables, variables)
    )

    # Compute update deltas for the local variables via the inner optimizer.
    local_diffs = self.optimizer.fn_step(time=time, variables=variables, **kwargs)

    with tf.control_dependencies(control_inputs=local_diffs):
        # Apply the local deltas to the shared global variables.
        applied = self.optimizer.apply_step(variables=global_variables, diffs=local_diffs)

    with tf.control_dependencies(control_inputs=(applied,)):
        # Pull the local variables back to the updated global values.
        update_diffs = list()
        for global_var, local_var in zip(global_variables, variables):
            diff = global_var - local_var
            update_diffs.append(diff)
        applied = self.apply_step(variables=variables, diffs=update_diffs)
        # TODO: Update time, episode, etc (like in Synchronization)?
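
# The snippet above is the A3C-style global/local update: the inner optimizer's deltas
# are applied to the shared global variables, and the local copy is then pulled back to
# the new global values. A standalone sketch of the same idea with plain tf.Variable
# objects (not Tensorforce API; the SGD delta and all names are illustrative):

import tensorflow as tf

def global_local_update(local_vars, global_vars, gradients, learning_rate=0.01):
    """Apply SGD deltas to the global copies, then re-sync the local copies."""
    for local_var, global_var, grad in zip(local_vars, global_vars, gradients):
        delta = -learning_rate * grad
        global_var.assign_add(delta)   # update shared parameters
        local_var.assign(global_var)   # pull local copy back in sync

# Example usage with one pair of variables.
local_w = tf.Variable([1.0, 2.0])
global_w = tf.Variable([1.0, 2.0])
grad = tf.constant([0.5, -0.5])
global_local_update([local_w], [global_w], [grad])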
# Scope handling
if Module.scope_stack is not None:
    for scope in reversed(Module.scope_stack[1:]):
        scope.__exit__(None, None, None)
    if len(Module.global_scope) > 0:
        temp_scope = tf.name_scope(name='/'.join(Module.global_scope))
        temp_scope.__enter__()
    tensors = util.fmap(function=util.identity_operation, xs=tensors)
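
# Minimal illustration of the manual __enter__/__exit__ scope handling used above,
# assuming TF2's tf.name_scope; the scope name "policy" is illustrative:

import tensorflow as tf

# Normal context-manager form:
with tf.name_scope("policy"):
    pass  # ops created here are grouped under the scope in graph mode

# Equivalent manual form, as in the snippet above:
scope = tf.name_scope("policy")
scope.__enter__()
# ... create ops ...
scope.__exit__(None, None, None)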
# TensorFlow summaries
assert Module.global_summary_step is not None
step = Module.retrieve_tensor(name=Module.global_summary_step)

summaries = list()
for name, tensor in tensors.items():
    shape = util.shape(x=tensor)
    if shape == ():
        summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
    elif shape == (-1,):
        tensor = tf.math.reduce_sum(input_tensor=tensor, axis=0)
        summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
    elif shape == (1,):
        tensor = tf.squeeze(input=tensor, axis=-1)
        summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
    elif shape == (-1, 1):
        tensor = tf.math.reduce_sum(input_tensor=tf.squeeze(input=tensor, axis=-1), axis=0)
        summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
    else:
        # General tensor as histogram (`label` is an argument of the enclosing method)
        assert not util.is_iterable(x=label) and label.endswith('-histogram')
        summaries.append(tf.summary.histogram(name=name, data=tensor, step=step))
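
# A standalone sketch of the shape-dependent dispatch above using TF2 summaries
# (writer path and all names are illustrative, not Tensorforce code): scalars are
# logged directly, batched values are reduced first, general tensors become histograms.

import tensorflow as tf

writer = tf.summary.create_file_writer("/tmp/summaries")
with writer.as_default():
    step = 0
    value = tf.constant(1.5)                # scalar -> scalar summary
    batch = tf.constant([0.3, 0.7, 1.1])    # batched values -> reduce first
    weights = tf.random.normal((64, 32))    # general tensor -> histogram

    tf.summary.scalar(name="loss", data=value, step=step)
    tf.summary.scalar(name="reward-sum", data=tf.reduce_sum(batch), step=step)
    tf.summary.histogram(name="weights-histogram", data=weights, step=step)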
def tf_reference(
    self, states, internals, actions, terminal, reward, next_states, next_internals
):
    embedding = self.network.apply(x=states, internals=internals)

    log_probs = list()
    for name, distribution, action in util.zip_items(self.distributions, actions):
        parameters = distribution.parametrize(x=embedding)
        log_prob = distribution.log_probability(parameters=parameters, action=action)
        collapsed_size = util.product(xs=util.shape(log_prob)[1:])
        log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size))
        log_probs.append(log_prob)

    log_probs = tf.concat(values=log_probs, axis=1)
    # Detach so the reference log-probabilities act as constants in ratio-based losses.
    return tf.stop_gradient(input=log_probs)
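
# A minimal sketch of the same idea in plain TensorFlow, assuming a single categorical
# action head (function and variable names are illustrative): compute per-sample
# log-probabilities of the taken actions and detach them as fixed reference values.

import tensorflow as tf

def reference_log_probs(logits, actions):
    """Log-probabilities of the taken actions, detached from the gradient."""
    log_softmax = tf.nn.log_softmax(logits, axis=-1)        # (batch, num_actions)
    taken = tf.gather(log_softmax, actions, batch_dims=1)   # (batch,)
    return tf.stop_gradient(taken)

logits = tf.constant([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]])
actions = tf.constant([0, 2])
print(reference_log_probs(logits, actions))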
else:
    tf_dtype = util.tf_dtype(dtype=dtype)

    # Variable initializer
    if isinstance(initializer, util.py_dtype(dtype=dtype)):
        initializer = tf.constant(value=initializer, dtype=tf_dtype, shape=shape)
    elif isinstance(initializer, np.ndarray):
        if initializer.shape != shape:
            raise TensorforceError(
                "Invalid variable initializer shape: {}.".format(initializer.shape)
            )
        initializer = tf.constant(value=initializer, dtype=tf_dtype)
    elif isinstance(initializer, tf.Tensor):
        if util.shape(x=initializer) != shape:
            raise TensorforceError(
                "Invalid variable initializer shape: {}.".format(util.shape(x=initializer))
            )
        # Already a tensor; used as-is.
    elif not isinstance(initializer, str):
        raise TensorforceError("Invalid variable initializer: {}".format(initializer))
    elif initializer[:6] == 'normal':
        if dtype != 'float':
            raise TensorforceError(
                message="Invalid variable initializer value for non-float variable: {}.".format(
                    initializer
                )
            )
        # He-style stddev (fan-in) for ReLU layers, Glorot-style (fan-in + fan-out) otherwise, capped at 0.1.
        if initializer[6:] == '-relu':
            stddev = min(0.1, sqrt(2.0 / util.product(xs=shape[:-1])))
        else:
            stddev = min(0.1, sqrt(2.0 / (util.product(xs=shape[:-1]) + shape[-1])))
        initializer = tf.random.normal(shape=shape, stddev=stddev, dtype=tf_dtype)
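
# The 'normal' / 'normal-relu' branch above amounts to a capped He- or Glorot-style
# standard deviation. A standalone sketch of the same arithmetic (function name is
# illustrative, not Tensorforce API):

from math import sqrt

def normal_stddev(shape, relu=False, cap=0.1):
    """Capped stddev: He-style fan-in for ReLU, Glorot-style fan-in + fan-out otherwise."""
    fan_in = 1
    for dim in shape[:-1]:
        fan_in *= dim
    fan_out = shape[-1]
    if relu:
        return min(cap, sqrt(2.0 / fan_in))
    return min(cap, sqrt(2.0 / (fan_in + fan_out)))

print(normal_stddev((64, 64)))             # dense layer 64 -> 64
print(normal_stddev((64, 64), relu=True))  # same layer followed by ReLU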
def tf_q_delta(self, q_value, next_q_value, terminal, reward):
    """
    Creates the deltas (or advantage) of the Q values.

    :return: A list of deltas per action
    """
    # Broadcast terminal and reward to the shape of the Q values.
    for _ in range(util.rank(q_value) - 1):
        terminal = tf.expand_dims(input=terminal, axis=1)
        reward = tf.expand_dims(input=reward, axis=1)
    multiples = (1,) + util.shape(q_value)[1:]
    terminal = tf.tile(input=terminal, multiples=multiples)
    reward = tf.tile(input=reward, multiples=multiples)

    # Zero out the bootstrapped value for terminal states.
    zeros = tf.zeros_like(tensor=next_q_value)
    next_q_value = tf.where(condition=terminal, x=zeros, y=(self.discount * next_q_value))

    # Temporal-difference error: reward + discount * Q'(s', a') - Q(s, a)
    return reward + next_q_value - q_value  # tf.stop_gradient(q_target)
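
# A standalone sketch of the same one-step temporal-difference error in plain
# TensorFlow (not Tensorforce code; names and the example values are illustrative):

import tensorflow as tf

def td_error(q_value, next_q_value, terminal, reward, discount=0.99):
    """TD error: reward + discount * Q'(s', a') * (1 - done) - Q(s, a)."""
    not_terminal = 1.0 - tf.cast(terminal, tf.float32)
    target = reward + discount * not_terminal * next_q_value
    return target - q_value

q = tf.constant([1.0, 2.0])
next_q = tf.constant([1.5, 0.0])
done = tf.constant([False, True])
r = tf.constant([0.0, 1.0])
print(td_error(q, next_q, done, r))  # [0.485, -1.0]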
def tf_apply(self, x):
    if self.reduction == 'concat':
        # Flatten everything after the batch dimension.
        return tf.reshape(tensor=x, shape=(-1, util.product(xs=util.shape(x)[1:])))
    elif self.reduction == 'max':
        # Reduce all dimensions except batch and the last (feature) dimension.
        for _ in range(util.rank(x=x) - 2):
            x = tf.reduce_max(input_tensor=x, axis=1)
        return x
    elif self.reduction == 'mean':
        for _ in range(util.rank(x=x) - 2):
            x = tf.reduce_mean(input_tensor=x, axis=1)
        return x
    elif self.reduction == 'product':
        for _ in range(util.rank(x=x) - 2):
            x = tf.reduce_prod(input_tensor=x, axis=1)
        return x
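
# The reductions above are global pooling over all non-batch, non-feature axes.
# A quick standalone illustration on a dummy (batch, height, width, channels) tensor
# (not Tensorforce code):

import tensorflow as tf

x = tf.random.normal((8, 5, 5, 32))            # (batch, height, width, channels)

flat = tf.reshape(x, (-1, 5 * 5 * 32))         # 'concat':  (8, 800)
pooled_max = tf.reduce_max(x, axis=[1, 2])     # 'max':     (8, 32)
pooled_mean = tf.reduce_mean(x, axis=[1, 2])   # 'mean':    (8, 32)

print(flat.shape, pooled_max.shape, pooled_mean.shape)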
parameters = distribution.parametrize(x=embedding)
target_parameters = target_distribution.parametrize(x=target_embedding)

q_value = self.tf_q_value(
    embedding=embedding, parameters=parameters, action=actions[name], name=name
)

# Notice, this is V', not Q', because NAF outputs V(s) separately.
next_state_value = target_distribution.states_value(parameters=target_parameters)

delta = self.tf_q_delta(
    q_value=q_value, next_q_value=next_state_value, terminal=terminal, reward=reward
)

collapsed_size = util.product(xs=util.shape(delta)[1:])
delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size))
deltas.append(delta)

# Surrogate loss as the mean squared error between actual observed rewards and expected rewards
loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)

# Optional Huber loss
huber_loss = self.huber_loss.value()

def no_huber_loss():
    return tf.square(x=loss_per_instance)

def apply_huber_loss():
    # Quadratic for small errors, linear beyond the huber_loss threshold.
    return tf.where(
        condition=(tf.abs(x=loss_per_instance) <= huber_loss),
        x=(0.5 * tf.square(x=loss_per_instance)),
        y=(huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * huber_loss))
    )
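
# A standalone sketch of the Huber loss used above (not Tensorforce code; names and
# the example values are illustrative): quadratic within the threshold, linear outside,
# which keeps gradients bounded for large TD errors.

import tensorflow as tf

def huber(error, threshold=1.0):
    """Huber loss: 0.5 * e^2 for |e| <= threshold, threshold * (|e| - 0.5 * threshold) otherwise."""
    abs_error = tf.abs(error)
    quadratic = 0.5 * tf.square(error)
    linear = threshold * (abs_error - 0.5 * threshold)
    return tf.where(abs_error <= threshold, quadratic, linear)

print(huber(tf.constant([0.5, 3.0])))  # [0.125, 2.5]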
def tf_regularize(self, states, internals):
    regularization_loss = super().tf_regularize(states=states, internals=internals)

    entropies = list()
    embedding = self.network.apply(x=states, internals=internals)
    for name, distribution in self.distributions.items():
        parameters = distribution.parametrize(x=embedding)
        entropy = distribution.entropy(parameters=parameters)
        collapsed_size = util.product(xs=util.shape(entropy)[1:])
        entropy = tf.reshape(tensor=entropy, shape=(-1, collapsed_size))
        entropies.append(entropy)

    entropies = tf.concat(values=entropies, axis=1)
    entropy_per_instance = tf.reduce_mean(input_tensor=entropies, axis=1)
    entropy = tf.reduce_mean(input_tensor=entropy_per_instance, axis=0)
    # entropy = self.add_summary(label='entropy', name='entropy', tensor=entropy)

    # Subtract the weighted entropy bonus: higher entropy lowers the regularization loss.
    entropy_regularization = self.entropy_regularization.value()
    regularization_loss = regularization_loss - entropy_regularization * entropy
    # def no_entropy_reg():
    #     return regularization_loss
    # def apply_entropy_reg():
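
# A standalone sketch of entropy regularization for a categorical policy head
# (not Tensorforce code; function name and weight are illustrative): subtracting a
# weighted mean entropy from the loss encourages exploration.

import tensorflow as tf

def entropy_regularized_loss(policy_loss, logits, weight=0.01):
    """Subtract a weighted mean policy entropy from the loss."""
    probs = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    entropy = -tf.reduce_sum(probs * log_probs, axis=-1)  # per-sample entropy
    return policy_loss - weight * tf.reduce_mean(entropy)

logits = tf.random.normal((16, 4))
print(entropy_regularized_loss(tf.constant(0.5), logits, weight=0.01))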
# Standard policy gradient log likelihood computation
log_prob = distribution.log_probability(action=action)
fixed_log_prob = fixed_distribution.log_probability(action=action)
log_prob_diff = log_prob - fixed_log_prob

# Probability ratio between the current and the fixed (old) policy.
prob_ratio = tf.exp(x=log_prob_diff)
prob_ratio = tf.reshape(tensor=prob_ratio, shape=(-1, shape_size))
prob_ratios.append(prob_ratio)

entropy = distribution.entropy()
entropy_penalty = -config.entropy_penalty * entropy
entropy_penalty = tf.reshape(tensor=entropy_penalty, shape=(-1, shape_size))
entropy_penalties.append(entropy_penalty)

# Placeholders for the previous (old) distribution parameters.
self.distribution_tensors[name] = list(distribution.get_tensors())
prev_distribution = list(
    tf.placeholder(dtype=tf.float32, shape=util.shape(tensor, unknown=None))
    for tensor in distribution.get_tensors()
)
self.prev_distribution_tensors[name] = prev_distribution
prev_distribution = distribution.from_tensors(
    tensors=prev_distribution, deterministic=self.deterministic
)

kl_divergence = prev_distribution.kl_divergence(other=distribution)
kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, shape_size))
kl_divergences.append(kl_divergence)

entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
entropies.append(entropy)

# The surrogate loss in PPO is the minimum of the clipped loss and
# target advantage * prob_ratio, which is the CPO loss.
# Presentation on conservative policy iteration:
# https://www.cs.cmu.edu/~jcl/presentation/RL/RL.ps
prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=prob_ratios, axis=1), axis=1)
clipped_prob_ratio = tf.clip_by_value(
    prob_ratio, 1.0 - config.loss_clipping, 1.0 + config.loss_clipping
)
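
# A standalone sketch of the PPO clipped surrogate objective built from these pieces
# (not Tensorforce code; names and example values are illustrative):

import tensorflow as tf

def ppo_clipped_surrogate(log_prob, old_log_prob, advantage, clipping=0.2):
    """Negative mean of min(ratio * advantage, clipped_ratio * advantage)."""
    ratio = tf.exp(log_prob - old_log_prob)
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - clipping, 1.0 + clipping)
    surrogate = tf.minimum(ratio * advantage, clipped_ratio * advantage)
    return -tf.reduce_mean(surrogate)  # negated because optimizers minimize

log_p = tf.constant([-0.5, -1.2])
old_log_p = tf.constant([-0.7, -1.0])
adv = tf.constant([1.0, -0.5])
print(ppo_clipped_surrogate(log_p, old_log_p, adv))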