
improved API for getting/setting variables within the graph

Itai Caspi
2017-10-25 16:07:58 +03:00
parent e33b0e8534
commit 1918f16079
5 changed files with 45 additions and 17 deletions
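In short, the commit replaces direct TensorFlow session access in the agents with architecture-level accessors. A minimal before/after sketch of the pattern, assembled from the diff below (the "head" local is only for brevity and is not part of the commit):

# before: agents reached into the TensorFlow session directly
curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)

# after: the network wrapper hides the session behind a generic accessor
curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate)

# writes go through a pre-built assign op and its placeholder
head = self.policy_network.online_network.output_heads[0]
self.policy_network.online_network.set_variable_value(
    head.assign_kl_coefficient, new_kl_coefficient, head.kl_coefficient_ph)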

View File

@@ -16,7 +16,6 @@
 from agents.actor_critic_agent import *
 from random import shuffle
-import tensorflow as tf

 # Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
@@ -132,7 +131,7 @@ class ClippedPPOAgent(ActorCriticAgent):
                 loss[key] = np.mean(loss[key], 0)
             if self.tp.learning_rate_decay_rate != 0:
-                curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
+                curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate)
                 self.curr_learning_rate.add_sample(curr_learning_rate)
             else:
                 curr_learning_rate = self.tp.learning_rate

View File

@@ -16,7 +16,6 @@
 from agents.actor_critic_agent import *
 from random import shuffle
-import tensorflow as tf

 # Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
@@ -35,13 +34,6 @@ class PPOAgent(ActorCriticAgent):
                                               self.replicated_device, self.worker_device)
         self.networks.append(self.policy_network)

-        # operations for changing the kl coefficient
-        self.kl_coefficient = tf.placeholder('float', name='kl_coefficient')
-        self.increase_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
-                                                 self.kl_coefficient * 1.5)
-        self.decrease_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
-                                                 self.kl_coefficient / 1.5)

         # signals definition
         self.value_loss = Signal('Value Loss')
         self.signals.append(self.value_loss)
@@ -180,7 +172,7 @@ class PPOAgent(ActorCriticAgent):
                 loss[key] = np.mean(loss[key], 0)
             if self.tp.learning_rate_decay_rate != 0:
-                curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
+                curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate)
                 self.curr_learning_rate.add_sample(curr_learning_rate)
             else:
                 curr_learning_rate = self.tp.learning_rate
@@ -209,15 +201,24 @@ class PPOAgent(ActorCriticAgent):
         # update kl coefficient
         kl_target = self.tp.agent.target_kl_divergence
-        kl_coefficient = self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)
+        kl_coefficient = self.policy_network.online_network.get_variable_value(
+            self.policy_network.online_network.output_heads[0].kl_coefficient)
+        new_kl_coefficient = kl_coefficient
         if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
             # kl too high => increase regularization
-            self.tp.sess.run(self.increase_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
+            new_kl_coefficient *= 1.5
         elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
             # kl too low => decrease regularization
-            self.tp.sess.run(self.decrease_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
-        screen.log_title("KL penalty coefficient change = {} -> {}".format(
-            kl_coefficient, self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)))
+            new_kl_coefficient /= 1.5

+        # update the kl coefficient variable
+        if kl_coefficient != new_kl_coefficient:
+            self.policy_network.online_network.set_variable_value(
+                self.policy_network.online_network.output_heads[0].assign_kl_coefficient,
+                new_kl_coefficient,
+                self.policy_network.online_network.output_heads[0].kl_coefficient_ph)

+        screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))

     def post_training_commands(self):
         if self.tp.agent.use_kl_regularization:

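For orientation (an illustrative calculation, not part of the diff): with target_kl_divergence = 0.01, a measured KL of 0.02 exceeds 1.3 * 0.01 = 0.013, so the penalty coefficient is multiplied by 1.5; a measured KL of 0.005 falls below 0.7 * 0.01 = 0.007, so it is divided by 1.5; anything in between leaves the coefficient unchanged, and the graph variable is only written when the value actually changes.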
View File

@@ -68,3 +68,9 @@ class Architecture(object):
     def apply_gradients(self, gradients):
         pass
+
+    def get_variable_value(self, variable):
+        pass
+
+    def set_variable_value(self, assign_op, value, placeholder=None):
+        pass

View File

@@ -288,3 +288,21 @@ class TensorFlowArchitecture(Architecture):
         """
         summary_writer = tf.summary.FileWriter(summary_dir)
         summary_writer.add_graph(self.sess.graph)
+
+    def get_variable_value(self, variable):
+        """
+        Get the value of a variable from the graph
+        :param variable: the variable
+        :return: the value of the variable
+        """
+        return self.sess.run(variable)
+
+    def set_variable_value(self, assign_op, value, placeholder=None):
+        """
+        Updates the value of a variable.
+        This requires having an assign operation for the variable, and a placeholder which will provide the value
+        :param assign_op: an assign operation for the variable
+        :param value: a value to set the variable to
+        :param placeholder: a placeholder to hold the given value for injecting it into the variable
+        """
+        self.sess.run(assign_op, feed_dict={placeholder: value})

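The two methods above assume the assign op and its placeholder are built once at graph-construction time (as PPOHead does below); calling tf.assign on every update would keep growing the graph. A minimal self-contained sketch of the underlying TensorFlow 1.x pattern, using illustrative names modeled on the diff:

import tensorflow as tf

# build once: variable, placeholder, and the assign op tying them together
kl_coefficient = tf.Variable(1.0, trainable=False, name='kl_coefficient')
kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
assign_kl_coefficient = tf.assign(kl_coefficient, kl_coefficient_ph)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # get_variable_value reduces to a plain sess.run on the variable
    print(sess.run(kl_coefficient))    # 1.0
    # set_variable_value runs the assign op, feeding the new value through the placeholder
    sess.run(assign_kl_coefficient, feed_dict={kl_coefficient_ph: 1.5})
    print(sess.run(kl_coefficient))    # 1.5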
View File

@@ -351,15 +351,19 @@ class PPOHead(Head):
         self.num_actions = tuning_parameters.env_instance.action_space_size
         self.discrete_controls = tuning_parameters.env_instance.discrete_controls
         self.output_scale = np.max(tuning_parameters.env_instance.action_space_abs_range)
+        # kl coefficient and its corresponding assignment operation and placeholder
         self.kl_coefficient = tf.Variable(tuning_parameters.agent.initial_kl_coefficient,
                                           trainable=False, name='kl_coefficient')
+        self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
+        self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
+
         self.kl_cutoff = 2*tuning_parameters.agent.target_kl_divergence
         self.high_kl_penalty_coefficient = tuning_parameters.agent.high_kl_penalty_coefficient
         self.clip_likelihood_ratio_using_epsilon = tuning_parameters.agent.clip_likelihood_ratio_using_epsilon
         self.use_kl_regularization = tuning_parameters.agent.use_kl_regularization
         self.beta = tuning_parameters.agent.beta_entropy

     def _build_module(self, input_layer):
         eps = 1e-15