# to allow network sharing between the policy and value networks. This makes 'policy' and 'value_function'
# semantically imbalanced (though they are naturally imbalanced, since 'policy' has to interact
# with the environment while 'value_function' does not). I have an idea for resolving this imbalance
# that is not based on passing functions or overriding functions.
### 2. build policy, loss, optimizer
pi = policy.OnehotCategorical(my_policy, observation_placeholder=observation_ph, weight_update=0)
ppo_loss_clip = losses.ppo_clip(pi, clip_param)
total_loss = ppo_loss_clip
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)
### 3. define data collection
training_data = Batch(env, pi, advantage_estimation.full_return)
### 4. start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # assign pi to pi_old
    pi.sync_weights()  # TODO: automate this for policies with target network
    start_time = time.time()
    for i in range(100):
        # collect data
        training_data.collect(num_episodes=50)
        # print current return
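        # The snippet is truncated here; the lines below are a minimal sketch of how the
        # epoch typically continues. next_batch() and the literal update count / batch
        # size are assumptions, not part of the original snippet.
        print('Epoch {}:'.format(i))
        training_data.statistics()
        # update the network on minibatches sampled from the freshly collected data
        for _ in range(10):  # number of gradient steps per epoch (assumed)
            feed_dict = training_data.next_batch(64)  # assumed API and batch size
            sess.run(train_op, feed_dict=feed_dict)
        # refresh pi_old so the next epoch's PPO ratio is computed against the current policy
        pi.sync_weights()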
return None, action_values # no policy head
### 2. build policy, loss, optimizer
dqn = value_function.DQN(my_network, observation_placeholder=observation_ph, has_old_net=True)
pi = policy.DQN(dqn)
dqn_loss = losses.qlearning(dqn)
total_loss = dqn_loss
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(total_loss, var_list=list(dqn.trainable_variables))
### 3. define data collection
replay_buffer = VanillaReplayBuffer(capacity=2e4, nstep=1)
process_functions = [advantage_estimation.nstep_q_return(1, dqn)]
managed_networks = [dqn]
data_collector = DataCollector(
    env=env,
    policy=pi,
    data_buffer=replay_buffer,
    process_functions=process_functions,
    managed_networks=managed_networks
)
### 4. start training
# hyper-parameters
batch_size = 32
replay_buffer_warmup = 1000
epsilon_decay_interval = 500
epsilon = 0.6
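# The training loop is not shown in this snippet; below is a minimal sketch under the
# assumption that DataCollector exposes collect()/next_batch() as in the other examples
# here. The num_timesteps argument, the total step count, the sync_weights() call on the
# value network, and the way epsilon is handed to policy.DQN are all assumptions.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    dqn.sync_weights()  # initialize the target network (assumed, mirroring pi.sync_weights() elsewhere)
    # warm up the replay buffer before taking any gradient step
    data_collector.collect(num_timesteps=replay_buffer_warmup)  # assumed argument
    for step in range(10000):  # number of training iterations (assumed)
        # epsilon would be decayed every epsilon_decay_interval steps and passed to the
        # policy here; the exact API for that is not shown in this snippet
        data_collector.collect(num_timesteps=4)  # assumed argument
        feed_dict = data_collector.next_batch(batch_size)  # assumed API
        sess.run(train_op, feed_dict=feed_dict)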
action_values = tf.layers.dense(net, action_dim, activation=None)
return None, action_values # no policy head
### 2. build policy, loss, optimizer
dqn = value_function.DQN(my_network, observation_placeholder=observation_ph, weight_update=100)
pi = policy.DQN(dqn)
dqn_loss = losses.qlearning(dqn)
total_loss = dqn_loss
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(total_loss, var_list=dqn.trainable_variables)
### 3. define data collection
data_collector = Batch(env, pi, [advantage_estimation.nstep_q_return(1, dqn)], [dqn])
### 4. start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # sync the target network
    pi.sync_weights()  # TODO: automate this for policies with target network
    start_time = time.time()
    # TODO: repeat_num should be defined in a configuration file
    repeat_num = 100
    for i in range(repeat_num):
        # collect data with a linearly decaying epsilon-greedy exploration rate
        data_collector.collect(num_episodes=50, epsilon_greedy=(repeat_num - i + 0.0) / repeat_num)
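        # The snippet ends here; a minimal sketch of the update step, where statistics(),
        # next_batch() and the literal update count / batch size are assumptions:
        print('Epoch {}:'.format(i))
        data_collector.statistics()
        for _ in range(10):  # number of gradient steps per epoch (assumed)
            feed_dict = data_collector.next_batch(32)  # assumed API and batch size
            sess.run(train_op, feed_dict=feed_dict)
        # the target network is presumably refreshed internally every 100 updates (weight_update=100 above)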
### 2. build policy, critic, loss, optimizer
actor = policy.OnehotCategorical(my_network, observation_placeholder=observation_ph, weight_update=1)
critic = value_function.StateValue(my_network, observation_placeholder=observation_ph)
actor_loss = losses.REINFORCE(actor)
critic_loss = losses.value_mse(critic)
actor_optimizer = tf.train.AdamOptimizer(1e-4)
actor_train_op = actor_optimizer.minimize(actor_loss, var_list=actor.trainable_variables)
critic_optimizer = tf.train.RMSPropOptimizer(1e-4)
critic_train_op = critic_optimizer.minimize(critic_loss, var_list=critic.trainable_variables)
### 3. define data collection
data_collector = Batch(env, actor,
                       [advantage_estimation.gae_lambda(1, critic), advantage_estimation.nstep_return(1, critic)],
                       [actor, critic])
### 4. start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    start_time = time.time()
    for i in range(100):
        # collect data
        data_collector.collect(num_episodes=20)
        # print current return
        print('Epoch {}:'.format(i))
        data_collector.statistics()
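        # The update step is truncated here; a minimal sketch, where next_batch() and the
        # literal update count / batch size are assumptions:
        for _ in range(10):  # number of gradient steps per epoch (assumed)
            feed_dict = data_collector.next_batch(64)  # assumed API and batch size
            # one gradient step each on the REINFORCE actor and the MSE critic
            sess.run([actor_train_op, critic_train_op], feed_dict=feed_dict)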
pi = policy.Distributional(my_policy, observation_placeholder=observation_ph, has_old_net=True)
ppo_loss_clip = losses.ppo_clip(pi, clip_param)
total_loss = ppo_loss_clip
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(total_loss, var_list=list(pi.trainable_variables))
### 3. define data collection
data_buffer = BatchSet()
data_collector = DataCollector(
    env=env,
    policy=pi,
    data_buffer=data_buffer,
    process_functions=[advantage_estimation.full_return],
    managed_networks=[pi],
)
### 4. start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # assign pi to pi_old
    pi.sync_weights()
    start_time = time.time()
    for i in range(1000):
        # collect data
        data_collector.collect(num_episodes=50)
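        # Truncated here; the epoch would continue with the same pattern as the PPO sketch
        # above (statistics, minibatch updates on train_op, then pi.sync_weights() to refresh
        # pi_old). next_batch() and the literal update count / batch size are assumptions.
        print('Epoch {}:'.format(i))
        data_collector.statistics()
        for _ in range(10):  # assumed
            feed_dict = data_collector.next_batch(64)  # assumed API and batch size
            sess.run(train_op, feed_dict=feed_dict)
        pi.sync_weights()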
### 2. build policy, critic, loss, optimizer
print('actor and critic will share the first two layers in this case, and the third layer will cause an error')
actor = policy.OnehotCategorical(my_actor, observation_placeholder=observation_ph, weight_update=1)
critic = value_function.StateValue(my_critic, observation_placeholder=observation_ph)
actor_loss = losses.vanilla_policy_gradient(actor)
critic_loss = losses.value_mse(critic)
total_loss = actor_loss + critic_loss
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(total_loss, var_list=actor.trainable_variables)
### 3. define data collection
training_data = Batch(env, actor, advantage_estimation.full_return)
### 4. start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # assign actor to pi_old
    actor.sync_weights()  # TODO: automate this for policies with target network
    start_time = time.time()
    for i in range(100):
        # collect data
        training_data.collect(num_episodes=20)
        # print current return
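        # Truncated here; a minimal sketch of the rest of the epoch, where statistics(),
        # next_batch() and the literal update count / batch size are assumptions:
        print('Epoch {}:'.format(i))
        training_data.statistics()
        # update the shared network on the joint actor/critic loss
        for _ in range(10):  # assumed
            feed_dict = training_data.next_batch(64)  # assumed API and batch size
            sess.run(train_op, feed_dict=feed_dict)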
critic = ts.value_function.ActionValue(my_network, observation_placeholder=observation_ph,
                                       action_placeholder=action_ph, has_old_net=True)
soft_update_op = ts.get_soft_update_op(1e-2, [actor, critic])
critic_loss = ts.losses.value_mse(critic)
critic_optimizer = tf.train.AdamOptimizer(1e-3)
critic_train_op = critic_optimizer.minimize(critic_loss, var_list=list(critic.trainable_variables))
dpg_grads_vars = ts.opt.DPG(actor, critic)
actor_optimizer = tf.train.AdamOptimizer(1e-3)
actor_train_op = actor_optimizer.apply_gradients(dpg_grads_vars)
### 3. define data collection
data_buffer = ts.data.VanillaReplayBuffer(capacity=10000, nstep=1)
process_functions = [ts.data.advantage_estimation.ddpg_return(actor, critic)]
data_collector = ts.data.DataCollector(
    env=env,
    policy=actor,
    data_buffer=data_buffer,
    process_functions=process_functions,
    managed_networks=[actor, critic]
)
### 4. start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # initialize the target networks of the actor and the critic
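    # The snippet ends here; a minimal sketch of target initialization and the DDPG loop.
    # The sync_weights() calls, next_batch(), num_timesteps and the literal step/batch
    # counts are assumptions mirroring the other snippets.
    actor.sync_weights()
    critic.sync_weights()
    start_time = time.time()
    for step in range(10000):  # number of training iterations (assumed)
        # collect a little experience, then take one gradient step on each network
        data_collector.collect(num_timesteps=1)  # assumed argument
        feed_dict = data_collector.next_batch(64)  # assumed API and batch size
        sess.run(critic_train_op, feed_dict=feed_dict)
        sess.run(actor_train_op, feed_dict=feed_dict)
        # Polyak-average the target networks with the soft update op built above
        sess.run(soft_update_op)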
# 2. build losses, optimizers
pi = policy.OnehotCategorical(action_logits, observation_placeholder=observation) # YongRen: policy.Gaussian (could reference the policy in TRPO paper, my code is adapted from zhusuan.distributions) policy.DQN etc.
# for continuous action space, you may need to change an environment to run
pi_old = policy.OnehotCategorical(action_logits_old, observation_placeholder=observation)
action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions
advantage = tf.placeholder(dtype=tf.float32, shape=[None])  # advantage values used in the policy gradient
ppo_loss_clip = losses.ppo_clip(action, advantage, clip_param, pi, pi_old) # TongzhengRen: losses.vpg ... management of placeholders and feed_dict
total_loss = ppo_loss_clip
optimizer = tf.train.AdamOptimizer(1e-3)
train_op = optimizer.minimize(total_loss, var_list=train_var_list)
# 3. define data collection
training_data = Batch(env, pi, advantage_estimation.full_return) # YouQiaoben: finish and polish Batch, advantage_estimation.gae_lambda as in PPO paper
# ShihongSong: Replay(), see dqn_example.py
# maybe a dict to manage the elements to be collected
# 4. start training
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    minibatch_count = 0
    collection_count = 0
    while True:  # until some stopping criterion is met...
        # collect data
        training_data.collect(num_episodes=2)  # YouQiaoben, ShihongSong
        collection_count += 1
        print('Collected {} times.'.format(collection_count))
        # update network
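        # The update step is truncated here; a minimal sketch, where next_batch() and the
        # literal minibatch count / size are assumptions:
        for _ in range(10):  # number of minibatches per collection (assumed)
            feed_dict = training_data.next_batch(64)  # assumed API and minibatch size
            sess.run(train_op, feed_dict=feed_dict)
            minibatch_count += 1
        print('Updated {} minibatches.'.format(minibatch_count))
        # pi_old should be synced to pi here (e.g. via a list of tf.assign ops)
        # before the next round of collection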
is_training_ph = tf.placeholder(tf.bool, shape=())
keep_prob_ph = tf.placeholder(tf.float32, shape=())
my_policy = MyPolicy(observation_ph, is_training_ph, keep_prob_ph, action_dim)
### 2. build policy, loss, optimizer
pi = policy.Normal(my_policy, observation_placeholder=observation_ph, weight_update=0)
ppo_loss_clip = losses.ppo_clip(pi, clip_param)
total_loss = ppo_loss_clip
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)
### 3. define data collection
training_data = Batch(env, pi, advantage_estimation.full_return)
### 4. start training
feed_dict_train = {is_training_ph: True, keep_prob_ph: 0.8}
feed_dict_test = {is_training_ph: False, keep_prob_ph: 1}
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # assign pi to pi_old
    pi.sync_weights()  # TODO: automate this for policies with target network
    start_time = time.time()
    for i in range(100):
        # collect data
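        # Truncated here; a minimal sketch, assuming the collector accepts an extra feed_dict
        # (my_feed_dict) so that dropout/batch-norm run in inference mode during collection and
        # in training mode during updates. next_batch(), my_feed_dict and the literal update
        # count / batch size are assumptions.
        training_data.collect(num_episodes=50, my_feed_dict=feed_dict_test)
        print('Epoch {}:'.format(i))
        training_data.statistics()
        for _ in range(10):  # assumed
            feed_dict = training_data.next_batch(64)  # assumed API and batch size
            feed_dict.update(feed_dict_train)
            sess.run(train_op, feed_dict=feed_dict)
        pi.sync_weights()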
optimizer = tf.train.AdamOptimizer(1e-4)
# this hack would be unnecessary if we had a `SharedPolicyValue` class, or if we hacked the trainable_variables management
var_list = list(actor.trainable_variables | critic.trainable_variables)
train_op = optimizer.minimize(total_loss, var_list=var_list)
### 3. define data collection
data_buffer = ts.data.BatchSet()
data_collector = ts.data.DataCollector(
    env=env,
    policy=actor,
    data_buffer=data_buffer,
    process_functions=[ts.data.advantage_estimation.nstep_return(n=3, value_function=critic, return_advantage=True)],
    managed_networks=[actor, critic],
)
### 4. start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    start_time = time.time()
    for i in range(1000):
        # collect data
        data_collector.collect(num_episodes=50)
        # print current return
        print('Epoch {}:'.format(i))
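        # Truncated here; the epoch would finish with the same pattern as above (statistics,
        # then minibatch updates of the shared actor-critic on train_op). next_batch() and the
        # literal update count / batch size are assumptions.
        data_collector.statistics()
        for _ in range(10):  # assumed
            feed_dict = data_collector.next_batch(64)  # assumed API and batch size
            sess.run(train_op, feed_dict=feed_dict)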