    Args:
        terminal: The 1D tensor (bool) of terminal signals to process (more than one True within that list is ok).
        reward: The 1D tensor (float) of rewards to process.

    Returns: Tf op to fetch when `observe()` is called.
    """
    # Increment episode
    num_episodes = tf.count_nonzero(input_tensor=terminal, dtype=util.tf_dtype('int'))
    increment_episode = tf.assign_add(ref=self.episode, value=tf.to_int64(x=num_episodes))
    increment_global_episode = tf.assign_add(ref=self.global_episode, value=tf.to_int64(x=num_episodes))

    with tf.control_dependencies(control_inputs=(increment_episode, increment_global_episode)):
        # Stop gradients
        fn = (lambda x: tf.stop_gradient(input=x[:self.list_buffer_index[index]]))
        states = util.map_tensors(fn=fn, tensors=self.list_states_buffer, index=index)
        internals = util.map_tensors(fn=fn, tensors=self.list_internals_buffer, index=index)
        actions = util.map_tensors(fn=fn, tensors=self.list_actions_buffer, index=index)
        terminal = tf.stop_gradient(input=terminal)
        reward = tf.stop_gradient(input=reward)

        # Observation
        observation = self.fn_observe_timestep(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward
        )

    with tf.control_dependencies(control_inputs=(observation,)):
        # Reset buffer index.
        reset_index = tf.assign(ref=self.list_buffer_index[index], value=0)
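The episode counter in the snippet above is driven purely by graph-side side effects: the True entries in `terminal` are counted, added to the episode variables, and everything downstream is placed under a control dependency so it only executes after the counters have been updated. Below is a minimal stand-alone sketch of that pattern, assuming plain TensorFlow 1.x graph mode and hypothetical variable names; it is not the Tensorforce model itself.

import tensorflow as tf

terminal = tf.placeholder(dtype=tf.bool, shape=(None,), name='terminal')
episode = tf.Variable(initial_value=0, dtype=tf.int64, trainable=False, name='episode')

# Count the True entries in the terminal batch and add them to the episode counter.
num_episodes = tf.count_nonzero(input_tensor=terminal, dtype=tf.int32)
increment_episode = tf.assign_add(ref=episode, value=tf.to_int64(x=num_episodes))

with tf.control_dependencies(control_inputs=(increment_episode,)):
    # Ops created here only run after the counter update has executed.
    episode_output = episode + 0

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    # Two terminals in the batch -> counter advances by 2.
    print(session.run(episode_output, feed_dict={terminal: [False, True, False, True]}))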
        actions (any): One action (usually a value tuple) or dict of actions if multiple actions are expected.
        internals (any): Internal list.
        terminal (bool): boolean indicating if the episode terminated after the observation.
        reward (float): scalar reward that resulted from executing the action.

    Returns: Tf op to fetch when `observe()` is called.
    """
    # Increment episode
    num_episodes = tf.count_nonzero(input_tensor=terminal, dtype=util.tf_dtype('int'))
    increment_episode = tf.assign_add(ref=self.episode, value=tf.to_int64(x=num_episodes))
    increment_global_episode = tf.assign_add(ref=self.global_episode, value=tf.to_int64(x=num_episodes))

    with tf.control_dependencies(control_inputs=(increment_episode, increment_global_episode)):
        # Stop gradients
        # Not using buffers here.
        states = util.map_tensors(fn=tf.stop_gradient, tensors=states)
        internals = util.map_tensors(fn=tf.stop_gradient, tensors=internals)
        actions = util.map_tensors(fn=tf.stop_gradient, tensors=actions)
        terminal = tf.stop_gradient(input=terminal)
        reward = tf.stop_gradient(input=reward)

        # Observation
        observation = self.fn_observe_timestep(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward
        )

    with tf.control_dependencies(control_inputs=(observation,)):
        # Trivial operation to enforce control dependency.
        self.unbuffered_episode_output = self.global_episode + 0
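Every tensor that enters the observation step above is first wrapped in tf.stop_gradient, so whatever loss the observe/update graph computes cannot backpropagate into the graph that produced the states, actions, and internals. A small illustration of that behaviour follows, using plain TensorFlow 1.x and hypothetical names rather than the Tensorforce graph.

import tensorflow as tf

w = tf.Variable(initial_value=2.0, name='w')
x = tf.constant(3.0)

y = w * x                              # depends on the variable w
y_blocked = tf.stop_gradient(input=y)  # gradients cannot flow back through this tensor

loss = tf.square(y_blocked - 1.0)
# No gradient path from loss to w remains, so tf.gradients returns None for w.
print(tf.gradients(ys=loss, xs=[w])[0])  # None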
    #     internals=internals,
    #     update=update,
    #     deterministic=deterministic
    # )
    # self.fn_loss_per_instance(
    #     states=states,
    #     internals=internals,
    #     actions=actions,
    #     terminal=terminal,
    #     reward=reward,
    #     update=update
    # )

    self.fn_initialize()

    # Input tensors
    states = util.map_tensors(fn=tf.identity, tensors=self.states_input)
    internals = util.map_tensors(fn=tf.identity, tensors=self.internals_input)
    actions = util.map_tensors(fn=tf.identity, tensors=self.actions_input)
    terminal = tf.identity(input=self.terminal_input)
    reward = tf.identity(input=self.reward_input)
    # Probably both deterministic and independent should be the same at some point.
    deterministic = tf.identity(input=self.deterministic_input)
    independent = tf.identity(input=self.independent_input)

    states, actions, reward = self.fn_preprocess(states=states, actions=actions, reward=reward)

    self.create_operations(
        states=states,
        internals=internals,
        actions=actions,
        terminal=terminal,
        reward=reward,
        deterministic=deterministic,
        independent=independent
    )
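The setup fragment above wires fresh graph tensors to the externally fed placeholders by passing each one through tf.identity before preprocessing and operation construction. A stripped-down sketch of that input wiring is shown below, with hypothetical shapes and names and plain TensorFlow 1.x placeholders standing in for the real Tensorforce inputs.

import tensorflow as tf

# Feed placeholders, as they might be registered on the model.
states_input = tf.placeholder(dtype=tf.float32, shape=(None, 4), name='states-input')
terminal_input = tf.placeholder(dtype=tf.bool, shape=(None,), name='terminal-input')
reward_input = tf.placeholder(dtype=tf.float32, shape=(None,), name='reward-input')
deterministic_input = tf.placeholder(dtype=tf.bool, shape=(), name='deterministic-input')

# Downstream graph construction works on identity copies rather than on the raw placeholders.
states = tf.identity(input=states_input)
terminal = tf.identity(input=terminal_input)
reward = tf.identity(input=reward_input)
deterministic = tf.identity(input=deterministic_input)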
def create_observe_operations(self, terminal, reward):
    # Increment episode
    num_episodes = tf.count_nonzero(input_tensor=terminal, dtype=util.tf_dtype('int'))
    increment_episode = tf.assign_add(ref=self.episode, value=num_episodes)
    increment_global_episode = tf.assign_add(ref=self.global_episode, value=num_episodes)

    with tf.control_dependencies(control_inputs=(increment_episode, increment_global_episode)):
        # Stop gradients
        fn = (lambda x: tf.stop_gradient(input=x[:self.buffer_index]))
        states = util.map_tensors(fn=fn, tensors=self.states_buffer)
        internals = util.map_tensors(fn=fn, tensors=self.internals_buffer)
        actions = util.map_tensors(fn=fn, tensors=self.actions_buffer)
        terminal = tf.stop_gradient(input=terminal)
        reward = tf.stop_gradient(input=reward)

        # Observation
        observation = self.fn_observe_timestep(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward
        )

    with tf.control_dependencies(control_inputs=(observation,)):
        # Reset index
        reset_index = tf.assign(ref=self.buffer_index, value=0)
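create_observe_operations consumes only the filled prefix of the act-time buffers (everything up to self.buffer_index), shields it with tf.stop_gradient, and places the index reset under a control dependency so that, at run time, the reset happens only after the buffered timesteps have been consumed. A minimal stand-alone sketch of that buffer-and-reset pattern follows, assuming plain TensorFlow 1.x and a hypothetical capacity and names; it is not the Tensorforce buffer implementation.

import tensorflow as tf

capacity = 8
reward_buffer = tf.Variable(initial_value=tf.zeros(shape=(capacity,)), trainable=False, name='reward-buffer')
buffer_index = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name='buffer-index')

# Slice out only the entries written so far and block gradients into the buffer.
buffered_rewards = tf.stop_gradient(input=reward_buffer[:buffer_index])

# Stand-in for fn_observe_timestep: anything that consumes the buffered values.
observation = tf.reduce_sum(input_tensor=buffered_rewards)

with tf.control_dependencies(control_inputs=(observation,)):
    # The index is reset only after the observation op has run.
    reset_index = tf.assign(ref=buffer_index, value=0)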