Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Continuous-action policy head (fragment; the enclosing definition is not visible here).
# Builds a mean and a state-dependent log-scale from `hidden_policy`, draws a sample,
# and computes the Gaussian log-density of that sample.
with tf.variable_scope(scope):
# Mean of the action distribution: linear layer (no activation) with self.act_size[0] outputs.
mu = tf.layers.dense(
hidden_policy,
self.act_size[0],
activation=None,
name="mu",
kernel_initializer=LearningModel.scaled_init(0.01),
)
# Policy-dependent log_sigma_sq
log_sigma_sq = tf.layers.dense(
hidden_policy,
self.act_size[0],
activation=None,
name="log_std",
kernel_initializer=LearningModel.scaled_init(0.01),
)
# Clamp the raw layer output to [LOG_STD_MIN, LOG_STD_MAX] before it is exponentiated.
self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN, LOG_STD_MAX)
# NOTE(review): despite the "_sq" names, the layer is named "log_std" and the
# exponentiated value is used directly as the noise scale below (and the density
# uses 2 * log_sigma_sq), i.e. it behaves as a standard deviation — confirm naming.
sigma_sq = tf.exp(self.log_sigma_sq)
# Do the reparameterization trick
policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq
# Per-dimension Gaussian log-density terms:
#   -0.5 * (((x - mu) / scale) ** 2 + 2 * log(scale) + log(2 * pi))
# EPSILON is added to the scale so the division cannot hit zero.
_gauss_pre = -0.5 * (
((policy_ - mu) / (tf.exp(self.log_sigma_sq) + EPSILON)) ** 2
+ 2 * self.log_sigma_sq
+ np.log(2 * np.pi)
)
# Sum the per-dimension terms along axis 1, keeping that axis in the result.
all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)
self.memory_in[:, _half_point:],
self.sequence_length,
name="lstm_value",
)
self.memory_out = tf.concat(
[memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
)
else:
# Fragment of the branch following an `else:` whose matching `if` is outside this view.
# Stream 0 is used as the policy hidden state and stream 1 as the value hidden state.
hidden_policy = hidden_streams[0]
hidden_value = hidden_streams[1]
# Mean of the action distribution: linear layer (no activation) with self.act_size[0]
# outputs; reuse=tf.AUTO_REUSE lets the layer's variables be shared if they already exist.
mu = tf.layers.dense(
hidden_policy,
self.act_size[0],
activation=None,
kernel_initializer=LearningModel.scaled_init(0.01),
reuse=tf.AUTO_REUSE,
)
# Here the log-scale is a free variable of shape [act_size[0]] initialised to zeros,
# not a function of `hidden_policy`.
self.log_sigma_sq = tf.get_variable(
"log_sigma_squared",
[self.act_size[0]],
dtype=tf.float32,
initializer=tf.zeros_initializer(),
)
sigma_sq = tf.exp(self.log_sigma_sq)
# Placeholder named "epsilon" with one row per sample and act_size[0] columns.
# NOTE(review): presumably the externally supplied sampling noise — confirm at the feed site.
self.epsilon = tf.placeholder(
shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
)
# Clip and scale output to ensure actions are always within [-1, 1] range.
from mlagents.trainers.models import LearningModel, LearningRateSchedule, EncoderType
# Upper and lower bounds used when clipping the policy's log-scale output
# (passed to tf.clip_by_value before exponentiation).
LOG_STD_MAX = 2
LOG_STD_MIN = -20
EPSILON = 1e-6 # Small value to avoid divide by zero
DISCRETE_TARGET_ENTROPY_SCALE = 0.2 # Roughly equal to e-greedy 0.05
CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0 # TODO: Make these an optional hyperparam.
# Logger shared under the "mlagents.trainers" name.
LOGGER = logging.getLogger("mlagents.trainers")
# NOTE(review): presumably the variable-scope names for the policy and target
# networks (policy at the root scope) — confirm against the use sites.
POLICY_SCOPE = ""
TARGET_SCOPE = "target_network"
class SACNetwork(LearningModel):
"""
Base class for an SAC network. Implements methods for creating the actor and critic heads.
"""
def __init__(
self,
brain,
m_size=None,
h_size=128,
normalize=False,
use_recurrent=False,
num_layers=2,
stream_names=None,
seed=0,
vis_encode_type=EncoderType.SIMPLE,
):
import logging
from typing import Optional
import numpy as np
from mlagents.tf_utils import tf
from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule
logger = logging.getLogger("mlagents.trainers")
class PPOModel(LearningModel):
def __init__(
self,
brain,
lr=1e-4,
lr_schedule=LearningRateSchedule.LINEAR,
h_size=128,
epsilon=0.2,
beta=1e-3,
max_step=5e6,
normalize=False,
use_recurrent=False,
num_layers=2,
m_size=None,
seed=0,
stream_names=None,
vis_encode_type=EncoderType.SIMPLE,
# Visual-observation encoders (fragment; the enclosing definition is not visible here).
# Only runs when the policy model has at least one visual observation.
if self.policy_model.vis_obs_size > 0:
self.expert_visual_in: List[tf.Tensor] = []
visual_policy_encoders = []
visual_expert_encoders = []
for i in range(self.policy_model.vis_obs_size):
# Create input ops for next (t+1) visual observations.
# NOTE(review): the op is named "gail_visual_observation_<i>" and stored in
# self.expert_visual_in, so this looks like the expert-observation input rather
# than a t+1 input — confirm and reword the comment above.
visual_input = self.policy_model.create_visual_input(
self.policy_model.brain.camera_resolutions[i],
name="gail_visual_observation_" + str(i),
)
self.expert_visual_in.append(visual_input)
# Encode the policy's own i-th visual input.
# NOTE(review): the trailing positional False/True presumably is the `reuse` flag —
# both calls pass the same scope name, which would make the expert encoder share
# the policy encoder's weights. Confirm against create_visual_observation_encoder.
encoded_policy_visual = self.policy_model.create_visual_observation_encoder(
self.policy_model.visual_in[i],
self.encoding_size,
LearningModel.swish,
1,
"gail_stream_{}_visual_obs_encoder".format(i),
False,
)
# Encode the expert's i-th visual input with the same arguments, except for the last flag.
encoded_expert_visual = self.policy_model.create_visual_observation_encoder(
self.expert_visual_in[i],
self.encoding_size,
LearningModel.swish,
1,
"gail_stream_{}_visual_obs_encoder".format(i),
True,
)
visual_policy_encoders.append(encoded_policy_visual)
visual_expert_encoders.append(encoded_expert_visual)
# Join all per-camera policy encodings along axis 1.
hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
def __init__(
self,
brain,
h_size=128,
lr=1e-4,
n_layers=2,
m_size=128,
normalize=False,
use_recurrent=False,
seed=0,
):
LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
num_streams = 1
hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
hidden = hidden_streams[0]
self.dropout_rate = tf.placeholder(
dtype=tf.float32, shape=[], name="dropout_rate"
)
hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
if self.use_recurrent:
tf.Variable(
self.m_size, name="memory_size", trainable=False, dtype=tf.int32
)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_reg, self.memory_out = self.create_recurrent_encoder(
hidden_reg, self.memory_in, self.sequence_length
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate PPO agent model for the environment.
:param brain: BrainInfo used to generate specific network graph.
:param lr: Learning rate.
:param h_size: Size of hidden layers
:param epsilon: Value for policy-divergence threshold.
:param beta: Strength of entropy regularization.
:return: a sub-class of PPOAgent tailored to the environment.
:param max_step: Total number of training steps.
:param normalize: Whether to normalize vector observation input.
:param use_recurrent: Whether to use an LSTM layer in the network.
:param num_layers: Number of hidden layers between encoded input and policy & value layers
:param m_size: Size of brain memory.
"""
# Model construction body (fragment; the signature that supplies `scope`, `use_curiosity`,
# `with_heuristics`, `curiosity_enc_size`, and `curiosity_strength` is not visible here).
with tf.variable_scope(scope):
LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
self.use_curiosity = use_curiosity
self.with_heuristics = with_heuristics
# Guarantee at least one hidden layer.
if num_layers < 1:
num_layers = 1
self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
# Pick the actor-critic builder by the brain's action-space type.
if brain.vector_action_space_type == "continuous":
self.create_cc_actor_critic(h_size, num_layers)
# Multiply the entropy by a ones tensor shaped like the flattened value output,
# so it takes on that shape via broadcasting.
self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
else:
self.create_dc_actor_critic(h_size, num_layers)
# Optional curiosity module: build the state encoders, then the inverse and forward models on them.
if self.use_curiosity:
self.curiosity_enc_size = curiosity_enc_size
self.curiosity_strength = curiosity_strength
encoded_state, encoded_next_state = self.create_curiosity_encoders()
self.create_inverse_model(encoded_state, encoded_next_state)
self.create_forward_model(encoded_state, encoded_next_state)
import tensorflow as tf
import tensorflow.contrib.layers as c_layers
from mlagents.trainers.models import LearningModel
# Behavioral cloning model built on LearningModel (fragment; the rest of __init__ is
# not visible in this excerpt).
class BehavioralCloningModel(LearningModel):
# NOTE(review): the default variable scope is 'PPO' even though this is the
# behavioral cloning model — confirm whether that default is intentional.
def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128,
normalize=False, use_recurrent=False, scope='PPO', seed=0):
with tf.variable_scope(scope):
LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
# A single observation stream is requested and used as the hidden representation.
num_streams = 1
hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
hidden = hidden_streams[0]
# Scalar placeholder so the dropout rate can be supplied at run time.
self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
if self.use_recurrent:
# Record the memory size in the graph as a non-trainable int32 variable named "memory_size".
tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
hidden_reg, self.memory_out = self.create_recurrent_encoder(hidden_reg, self.memory_in,
self.sequence_length)
# Re-expose the recurrent output under the fixed op name 'recurrent_out'.
self.memory_out = tf.identity(self.memory_out, name='recurrent_out')
from mlagents.trainers.models import LearningModel, LearningRateSchedule, EncoderType
# Upper and lower bounds used when clipping the policy's log-scale output
# (passed to tf.clip_by_value before exponentiation).
LOG_STD_MAX = 2
LOG_STD_MIN = -20
EPSILON = 1e-6 # Small value to avoid divide by zero
DISCRETE_TARGET_ENTROPY_SCALE = 0.2 # Roughly equal to e-greedy 0.05
CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0 # TODO: Make these an optional hyperparam.
# Logger shared under the "mlagents.trainers" name.
LOGGER = logging.getLogger("mlagents.trainers")
# NOTE(review): presumably the variable-scope names for the policy and target
# networks (policy at the root scope) — confirm against the use sites.
POLICY_SCOPE = ""
TARGET_SCOPE = "target_network"
class SACNetwork(LearningModel):
"""
Base class for an SAC network. Implements methods for creating the actor and critic heads.
"""
def __init__(
self,
brain,
m_size=None,
h_size=128,
normalize=False,
use_recurrent=False,
num_layers=2,
stream_names=None,
seed=0,
vis_encode_type=EncoderType.SIMPLE,
):
def __init__(
self,
brain,
h_size=128,
lr=1e-4,
n_layers=2,
m_size=128,
normalize=False,
use_recurrent=False,
seed=0,
):
LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
num_streams = 1
hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
hidden = hidden_streams[0]
self.dropout_rate = tf.placeholder(
dtype=tf.float32, shape=[], name="dropout_rate"
)
hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
if self.use_recurrent:
tf.Variable(
self.m_size, name="memory_size", trainable=False, dtype=tf.int32
)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_reg, self.memory_out = self.create_recurrent_encoder(
hidden_reg, self.memory_in, self.sequence_length