From 1918f160796525d2b5f241f81d3224a941418935 Mon Sep 17 00:00:00 2001
From: Itai Caspi
Date: Wed, 25 Oct 2017 16:07:58 +0300
Subject: [PATCH] improved API for getting / setting variables within the graph

---
 agents/clipped_ppo_agent.py                  |  3 +-
 agents/ppo_agent.py                          | 29 ++++++++++---------
 architectures/architecture.py                |  6 ++++
 .../tensorflow_components/architecture.py    | 18 ++++++++++++
 architectures/tensorflow_components/heads.py |  6 +++-
 5 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/agents/clipped_ppo_agent.py b/agents/clipped_ppo_agent.py
index cadfc0b..b22386d 100644
--- a/agents/clipped_ppo_agent.py
+++ b/agents/clipped_ppo_agent.py
@@ -16,7 +16,6 @@
 
 from agents.actor_critic_agent import *
 from random import shuffle
-import tensorflow as tf
 
 
 # Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
@@ -132,7 +131,7 @@ class ClippedPPOAgent(ActorCriticAgent):
             loss[key] = np.mean(loss[key], 0)
 
         if self.tp.learning_rate_decay_rate != 0:
-            curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
+            curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate)
             self.curr_learning_rate.add_sample(curr_learning_rate)
         else:
             curr_learning_rate = self.tp.learning_rate
diff --git a/agents/ppo_agent.py b/agents/ppo_agent.py
index 3ad2481..990abd8 100644
--- a/agents/ppo_agent.py
+++ b/agents/ppo_agent.py
@@ -16,7 +16,6 @@
 
 from agents.actor_critic_agent import *
 from random import shuffle
-import tensorflow as tf
 
 
 # Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
@@ -35,13 +34,6 @@ class PPOAgent(ActorCriticAgent):
                                              self.replicated_device, self.worker_device)
         self.networks.append(self.policy_network)
 
-        # operations for changing the kl coefficient
-        self.kl_coefficient = tf.placeholder('float', name='kl_coefficient')
-        self.increase_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
-                                                 self.kl_coefficient * 1.5)
-        self.decrease_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
-                                                 self.kl_coefficient / 1.5)
-
         # signals definition
         self.value_loss = Signal('Value Loss')
         self.signals.append(self.value_loss)
@@ -180,7 +172,7 @@ class PPOAgent(ActorCriticAgent):
             loss[key] = np.mean(loss[key], 0)
 
         if self.tp.learning_rate_decay_rate != 0:
-            curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
+            curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate)
             self.curr_learning_rate.add_sample(curr_learning_rate)
         else:
             curr_learning_rate = self.tp.learning_rate
@@ -209,15 +201,24 @@ class PPOAgent(ActorCriticAgent):
 
         # update kl coefficient
         kl_target = self.tp.agent.target_kl_divergence
-        kl_coefficient = self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)
+        kl_coefficient = self.policy_network.online_network.get_variable_value(
+            self.policy_network.online_network.output_heads[0].kl_coefficient)
+        new_kl_coefficient = kl_coefficient
         if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
             # kl too high => increase regularization
-            self.tp.sess.run(self.increase_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
+            new_kl_coefficient *= 1.5
         elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
             # kl too low => decrease regularization
-            self.tp.sess.run(self.decrease_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
-        screen.log_title("KL penalty coefficient change = {} -> {}".format(
-            kl_coefficient, self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)))
+            new_kl_coefficient /= 1.5
+
+        # update the kl coefficient variable
+        if kl_coefficient != new_kl_coefficient:
+            self.policy_network.online_network.set_variable_value(
+                self.policy_network.online_network.output_heads[0].assign_kl_coefficient,
+                new_kl_coefficient,
+                self.policy_network.online_network.output_heads[0].kl_coefficient_ph)
+
+        screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
 
     def post_training_commands(self):
         if self.tp.agent.use_kl_regularization:
diff --git a/architectures/architecture.py b/architectures/architecture.py
index f98659d..d3175b7 100644
--- a/architectures/architecture.py
+++ b/architectures/architecture.py
@@ -68,3 +68,9 @@ class Architecture(object):
 
     def apply_gradients(self, gradients):
         pass
+
+    def get_variable_value(self, variable):
+        pass
+
+    def set_variable_value(self, assign_op, value, placeholder=None):
+        pass
\ No newline at end of file
diff --git a/architectures/tensorflow_components/architecture.py b/architectures/tensorflow_components/architecture.py
index 6ae0241..19cb99d 100644
--- a/architectures/tensorflow_components/architecture.py
+++ b/architectures/tensorflow_components/architecture.py
@@ -288,3 +288,21 @@ class TensorFlowArchitecture(Architecture):
         """
         summary_writer = tf.summary.FileWriter(summary_dir)
         summary_writer.add_graph(self.sess.graph)
+
+    def get_variable_value(self, variable):
+        """
+        Get the value of a variable from the graph
+        :param variable: the variable
+        :return: the value of the variable
+        """
+        return self.sess.run(variable)
+
+    def set_variable_value(self, assign_op, value, placeholder=None):
+        """
+        Updates the value of a variable.
+        This requires having an assign operation for the variable, and a placeholder which will provide the value
+        :param assign_op: an assign operation for the variable
+        :param value: a value to set the variable to
+        :param placeholder: a placeholder to hold the given value for injecting it into the variable
+        """
+        self.sess.run(assign_op, feed_dict={placeholder: value})
diff --git a/architectures/tensorflow_components/heads.py b/architectures/tensorflow_components/heads.py
index 84b35f1..4d8ec17 100644
--- a/architectures/tensorflow_components/heads.py
+++ b/architectures/tensorflow_components/heads.py
@@ -351,15 +351,19 @@ class PPOHead(Head):
         self.num_actions = tuning_parameters.env_instance.action_space_size
         self.discrete_controls = tuning_parameters.env_instance.discrete_controls
         self.output_scale = np.max(tuning_parameters.env_instance.action_space_abs_range)
+
+        # kl coefficient and its corresponding assignment operation and placeholder
         self.kl_coefficient = tf.Variable(tuning_parameters.agent.initial_kl_coefficient,
                                           trainable=False, name='kl_coefficient')
+        self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
+        self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
+
         self.kl_cutoff = 2*tuning_parameters.agent.target_kl_divergence
         self.high_kl_penalty_coefficient = tuning_parameters.agent.high_kl_penalty_coefficient
         self.clip_likelihood_ratio_using_epsilon = tuning_parameters.agent.clip_likelihood_ratio_using_epsilon
         self.use_kl_regularization = tuning_parameters.agent.use_kl_regularization
         self.beta = tuning_parameters.agent.beta_entropy
 
-
     def _build_module(self, input_layer):
         eps = 1e-15
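Usage sketch (not part of the patch): the snippet below illustrates how the new get_variable_value / set_variable_value pair is intended to be used together with a variable, its assign op, and its placeholder, following the kl_coefficient pattern above. The ToyArchitecture class and the standalone session are illustrative assumptions only; the sketch assumes a TensorFlow 1.x graph and session, matching the tf.assign / tf.placeholder calls in heads.py.

# Illustrative sketch only, not part of the patch. Assumes TensorFlow 1.x.
import tensorflow as tf


class ToyArchitecture(object):
    """Hypothetical stand-in for TensorFlowArchitecture, reduced to the new variable API."""
    def __init__(self, sess):
        self.sess = sess
        # a graph variable plus its assign op and placeholder, as PPOHead now defines for kl_coefficient
        self.kl_coefficient = tf.Variable(1.0, trainable=False, name='kl_coefficient')
        self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
        self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)

    def get_variable_value(self, variable):
        # read the current value of a variable from the graph
        return self.sess.run(variable)

    def set_variable_value(self, assign_op, value, placeholder=None):
        # write a new value through the variable's assign op and placeholder
        self.sess.run(assign_op, feed_dict={placeholder: value})


with tf.Session() as sess:
    network = ToyArchitecture(sess)
    sess.run(tf.global_variables_initializer())

    # read the current value, scale it, and write it back through the assign op
    kl = network.get_variable_value(network.kl_coefficient)            # -> 1.0
    network.set_variable_value(network.assign_kl_coefficient, kl * 1.5,
                               network.kl_coefficient_ph)
    print(network.get_variable_value(network.kl_coefficient))          # -> 1.5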