From 1918f160796525d2b5f241f81d3224a941418935 Mon Sep 17 00:00:00 2001
From: Itai Caspi
Date: Wed, 25 Oct 2017 16:07:58 +0300
Subject: [PATCH] improved API for getting / setting variables within the graph

---
 agents/clipped_ppo_agent.py                  |  3 +-
 agents/ppo_agent.py                          | 29 ++++++++++---------
 architectures/architecture.py                |  6 ++++
 .../tensorflow_components/architecture.py    | 18 ++++++++++++
 architectures/tensorflow_components/heads.py |  6 +++-
 5 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/agents/clipped_ppo_agent.py b/agents/clipped_ppo_agent.py
index cadfc0b..b22386d 100644
--- a/agents/clipped_ppo_agent.py
+++ b/agents/clipped_ppo_agent.py
@@ -16,7 +16,6 @@
 
 from agents.actor_critic_agent import *
 from random import shuffle
-import tensorflow as tf
 
 
 # Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
@@ -132,7 +131,7 @@ class ClippedPPOAgent(ActorCriticAgent):
             loss[key] = np.mean(loss[key], 0)
 
         if self.tp.learning_rate_decay_rate != 0:
-            curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
+            curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate)
             self.curr_learning_rate.add_sample(curr_learning_rate)
         else:
             curr_learning_rate = self.tp.learning_rate
diff --git a/agents/ppo_agent.py b/agents/ppo_agent.py
index 3ad2481..990abd8 100644
--- a/agents/ppo_agent.py
+++ b/agents/ppo_agent.py
@@ -16,7 +16,6 @@
 
 from agents.actor_critic_agent import *
 from random import shuffle
-import tensorflow as tf
 
 
 # Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
@@ -35,13 +34,6 @@ class PPOAgent(ActorCriticAgent):
                                              self.replicated_device, self.worker_device)
         self.networks.append(self.policy_network)
 
-        # operations for changing the kl coefficient
-        self.kl_coefficient = tf.placeholder('float', name='kl_coefficient')
-        self.increase_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
-                                                 self.kl_coefficient * 1.5)
-        self.decrease_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
-                                                 self.kl_coefficient / 1.5)
-
         # signals definition
         self.value_loss = Signal('Value Loss')
         self.signals.append(self.value_loss)
@@ -180,7 +172,7 @@ class PPOAgent(ActorCriticAgent):
             loss[key] = np.mean(loss[key], 0)
 
         if self.tp.learning_rate_decay_rate != 0:
-            curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
+            curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate)
             self.curr_learning_rate.add_sample(curr_learning_rate)
         else:
             curr_learning_rate = self.tp.learning_rate
@@ -209,15 +201,24 @@ class PPOAgent(ActorCriticAgent):
 
         # update kl coefficient
         kl_target = self.tp.agent.target_kl_divergence
-        kl_coefficient = self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)
+        kl_coefficient = self.policy_network.online_network.get_variable_value(
+            self.policy_network.online_network.output_heads[0].kl_coefficient)
+        new_kl_coefficient = kl_coefficient
         if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
             # kl too high => increase regularization
-            self.tp.sess.run(self.increase_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
+            new_kl_coefficient *= 1.5
         elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
             # kl too low => decrease regularization
-            self.tp.sess.run(self.decrease_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
-        screen.log_title("KL penalty coefficient change = {} -> {}".format(
-            kl_coefficient, self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)))
+            new_kl_coefficient /= 1.5
+
+        # update the kl coefficient variable
+        if kl_coefficient != new_kl_coefficient:
+            self.policy_network.online_network.set_variable_value(
+                self.policy_network.online_network.output_heads[0].assign_kl_coefficient,
+                new_kl_coefficient,
+                self.policy_network.online_network.output_heads[0].kl_coefficient_ph)
+
+        screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
 
     def post_training_commands(self):
         if self.tp.agent.use_kl_regularization:
diff --git a/architectures/architecture.py b/architectures/architecture.py
index f98659d..d3175b7 100644
--- a/architectures/architecture.py
+++ b/architectures/architecture.py
@@ -68,3 +68,9 @@ class Architecture(object):
 
     def apply_gradients(self, gradients):
         pass
+
+    def get_variable_value(self, variable):
+        pass
+
+    def set_variable_value(self, assign_op, value, placeholder=None):
+        pass
\ No newline at end of file
diff --git a/architectures/tensorflow_components/architecture.py b/architectures/tensorflow_components/architecture.py
index 6ae0241..19cb99d 100644
--- a/architectures/tensorflow_components/architecture.py
+++ b/architectures/tensorflow_components/architecture.py
@@ -288,3 +288,21 @@ class TensorFlowArchitecture(Architecture):
         """
         summary_writer = tf.summary.FileWriter(summary_dir)
         summary_writer.add_graph(self.sess.graph)
+
+    def get_variable_value(self, variable):
+        """
+        Get the value of a variable from the graph
+        :param variable: the variable
+        :return: the value of the variable
+        """
+        return self.sess.run(variable)
+
+    def set_variable_value(self, assign_op, value, placeholder=None):
+        """
+        Updates the value of a variable.
+        This requires having an assign operation for the variable, and a placeholder which will provide the value
+        :param assign_op: an assign operation for the variable
+        :param value: a value to set the variable to
+        :param placeholder: a placeholder to hold the given value for injecting it into the variable
+        """
+        self.sess.run(assign_op, feed_dict={placeholder: value})
diff --git a/architectures/tensorflow_components/heads.py b/architectures/tensorflow_components/heads.py
index 84b35f1..4d8ec17 100644
--- a/architectures/tensorflow_components/heads.py
+++ b/architectures/tensorflow_components/heads.py
@@ -351,15 +351,19 @@ class PPOHead(Head):
         self.num_actions = tuning_parameters.env_instance.action_space_size
         self.discrete_controls = tuning_parameters.env_instance.discrete_controls
         self.output_scale = np.max(tuning_parameters.env_instance.action_space_abs_range)
+
+        # kl coefficient and its corresponding assignment operation and placeholder
         self.kl_coefficient = tf.Variable(tuning_parameters.agent.initial_kl_coefficient,
                                           trainable=False, name='kl_coefficient')
+        self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
+        self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
+
         self.kl_cutoff = 2*tuning_parameters.agent.target_kl_divergence
         self.high_kl_penalty_coefficient = tuning_parameters.agent.high_kl_penalty_coefficient
         self.clip_likelihood_ratio_using_epsilon = tuning_parameters.agent.clip_likelihood_ratio_using_epsilon
         self.use_kl_regularization = tuning_parameters.agent.use_kl_regularization
         self.beta = tuning_parameters.agent.beta_entropy
 
-
     def _build_module(self, input_layer):
         eps = 1e-15
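Usage sketch (not part of the patch): the snippet below illustrates how the new get_variable_value / set_variable_value pair is intended to be used together with a variable, its assign op, and its placeholder, following the kl_coefficient pattern above. The ToyArchitecture class and the standalone session are illustrative assumptions only; the sketch assumes a TensorFlow 1.x graph and session, matching the tf.assign / tf.placeholder calls in heads.py.

# Illustrative sketch only, not part of the patch. Assumes TensorFlow 1.x.
import tensorflow as tf


class ToyArchitecture(object):
    """Hypothetical stand-in for TensorFlowArchitecture, reduced to the new variable API."""
    def __init__(self, sess):
        self.sess = sess
        # a graph variable plus its assign op and placeholder, as PPOHead now defines for kl_coefficient
        self.kl_coefficient = tf.Variable(1.0, trainable=False, name='kl_coefficient')
        self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
        self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)

    def get_variable_value(self, variable):
        # read the current value of a variable from the graph
        return self.sess.run(variable)

    def set_variable_value(self, assign_op, value, placeholder=None):
        # write a new value through the variable's assign op and placeholder
        self.sess.run(assign_op, feed_dict={placeholder: value})


with tf.Session() as sess:
    network = ToyArchitecture(sess)
    sess.run(tf.global_variables_initializer())

    # read the current value, scale it, and write it back through the assign op
    kl = network.get_variable_value(network.kl_coefficient)            # -> 1.0
    network.set_variable_value(network.assign_kl_coefficient, kl * 1.5,
                               network.kl_coefficient_ph)
    print(network.get_variable_value(network.kl_coefficient))          # -> 1.5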