ACER algorithm (#184)
* initial ACER commit
* Code cleanup + several fixes
* Q-retrace bug fix + small clean-ups
* added documentation for acer
* ACER benchmarks
* update benchmarks table
* Add nightly running of golden and trace tests. (#202) Resolves #200
* comment out nightly trace tests until values reset.
* remove redundant observe ignore (#168)
* ensure nightly test env containers exist. (#205) Also bump integration test timeout
* wxPython removal (#207) Replacing wxPython with Python's Tkinter. Also removing the option to choose multiple files as it is unused and causes errors, and fixing the load file/directory spinner.
* Create CONTRIBUTING.md (#210). Resolves #188
* run nightly golden tests sequentially. (#217) Should reduce resource requirements and potential CPU contention but increases overall execution time.
* tests: added new setup configuration + test args (#211)
  - added utils for future tests and conftest
  - added test args
* new docs build
* golden test update
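For context on the "Q-retrace" fix mentioned above: ACER trains its policy head against Retrace targets Q^ret, computed backwards along each sampled trajectory with truncated importance weights. The snippet below is a minimal NumPy sketch of that quantity (the `Q_retrace` placeholder consumed by the new head further down), not Coach's implementation; the function name and arguments are illustrative, and terminal-state masking is omitted for brevity.

import numpy as np

def retrace_targets(rewards, q_taken, values, rho_taken, discount=0.99, truncation=1.0):
    """Backward recursion for the Retrace targets Q^ret along one trajectory.

    rewards:   (T,) rewards r_t
    q_taken:   (T,) Q(x_t, a_t) for the actions actually taken
    values:    (T + 1,) V(x_t), with values[T] used to bootstrap past the last step
    rho_taken: (T,) importance weights pi(a_t | x_t) / mu(a_t | x_t)
    """
    T = len(rewards)
    q_ret = np.zeros(T)
    carry = values[T]  # bootstrap from the value of the state after the last transition
    for t in reversed(range(T)):
        q_ret[t] = rewards[t] + discount * carry
        # the truncated importance weight limits how much of the TD error is propagated back
        c = min(truncation, rho_taken[t])
        carry = c * (q_ret[t] - q_taken[t]) + values[t]
    return q_ret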
@@ -350,7 +350,7 @@ class TensorFlowArchitecture(Architecture):
                 importance_weight = np.ones(target_ph.shape[0])
             else:
                 importance_weight = importance_weights[placeholder_idx]
-            importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
+            importance_weight = np.reshape(importance_weight, (-1,) + (1,) * (len(target_ph.shape) - 1))
 
             feed_dict[self.importance_weights[placeholder_idx]] = importance_weight
 
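The change in this hunk is whitespace only (PEP 8 spacing around the operators); the reshape itself exists so that a per-sample importance weight vector broadcasts against targets of any rank. A quick illustration with made-up shapes:

import numpy as np

target_ph = np.zeros((4, 3, 2))                      # hypothetical batch of 4 targets of shape (3, 2)
importance_weight = np.array([1.0, 0.5, 2.0, 1.5])   # one weight per sample

# (-1,) + (1,) * (len(target_ph.shape) - 1)  ->  (4, 1, 1)
importance_weight = np.reshape(importance_weight, (-1,) + (1,) * (len(target_ph.shape) - 1))

print(importance_weight.shape)   # (4, 1, 1): broadcasts over the trailing target dimensions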
@@ -11,6 +11,7 @@ from .q_head import QHead
 from .quantile_regression_q_head import QuantileRegressionQHead
 from .rainbow_q_head import RainbowQHead
 from .v_head import VHead
+from .acer_policy_head import ACERPolicyHead
 
 __all__ = [
     'CategoricalQHead',
@@ -25,5 +26,6 @@ __all__ = [
     'QHead',
     'QuantileRegressionQHead',
     'RainbowQHead',
-    'VHead'
+    'VHead',
+    'ACERPolicyHead'
 ]
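With the export added to `__all__`, the new head is importable alongside the existing TensorFlow heads (a usage sketch, not part of the diff):

from rl_coach.architectures.tensorflow_components.heads import ACERPolicyHead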
@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import tensorflow as tf

from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.architectures.tensorflow_components.heads.head import Head
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import ActionProbabilities
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps


class ACERPolicyHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
                 dense_layer=Dense):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                         dense_layer=dense_layer)
        self.name = 'acer_policy_head'
        self.return_type = ActionProbabilities
        self.beta = None
        self.action_penalty = None

        # a scalar weight that penalizes low entropy values to encourage exploration
        if hasattr(agent_parameters.algorithm, 'beta_entropy'):
            # we set the beta value as a tf variable so it can be updated later if needed
            self.beta = tf.Variable(float(agent_parameters.algorithm.beta_entropy),
                                    trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
            self.beta_placeholder = tf.placeholder('float')
            self.set_beta = tf.assign(self.beta, self.beta_placeholder)

    def _build_module(self, input_layer):
        if isinstance(self.spaces.action, DiscreteActionSpace):
            # create a discrete action network (softmax probabilities output)
            self._build_discrete_net(input_layer, self.spaces.action)
        else:
            raise ValueError("only discrete action spaces are supported for ACER")

        if self.is_local:
            # add entropy regularization
            if self.beta:
                self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
                self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
                tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)

            # Truncated importance sampling with bias corrections
            importance_sampling_weight = tf.placeholder(tf.float32, [None, self.num_actions],
                                                        name='{}_importance_sampling_weight'.format(self.get_name()))
            self.input.append(importance_sampling_weight)
            importance_sampling_weight_i = tf.placeholder(tf.float32, [None],
                                                          name='{}_importance_sampling_weight_i'.format(self.get_name()))
            self.input.append(importance_sampling_weight_i)

            V_values = tf.placeholder(tf.float32, [None], name='{}_V_values'.format(self.get_name()))
            self.target.append(V_values)
            Q_values = tf.placeholder(tf.float32, [None, self.num_actions], name='{}_Q_values'.format(self.get_name()))
            self.input.append(Q_values)
            Q_retrace = tf.placeholder(tf.float32, [None], name='{}_Q_retrace'.format(self.get_name()))
            self.input.append(Q_retrace)

            action_log_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
            self.probability_loss = -tf.reduce_mean(action_log_probs_wrt_policy
                                                    * (Q_retrace - V_values)
                                                    * tf.minimum(self.ap.algorithm.importance_weight_truncation,
                                                                 importance_sampling_weight_i))

            log_probs_wrt_policy = tf.log(self.policy_probs + eps)
            bias_correction_gain = tf.reduce_sum(log_probs_wrt_policy
                                                 * (Q_values - tf.expand_dims(V_values, 1))
                                                 * tf.nn.relu(1.0 - (self.ap.algorithm.importance_weight_truncation
                                                                     / (importance_sampling_weight + eps)))
                                                 * tf.stop_gradient(self.policy_probs),
                                                 axis=1)
            self.bias_correction_loss = -tf.reduce_mean(bias_correction_gain)

            self.loss = self.probability_loss + self.bias_correction_loss
            tf.losses.add_loss(self.loss)

            # Trust region
            batch_size = tf.to_float(tf.shape(input_layer)[0])
            average_policy = tf.placeholder(tf.float32, [None, self.num_actions],
                                            name='{}_average_policy'.format(self.get_name()))
            self.input.append(average_policy)
            average_policy_distribution = tf.contrib.distributions.Categorical(probs=(average_policy + eps))
            self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(average_policy_distribution,
                                                                               self.policy_distribution))
            if self.ap.algorithm.use_trust_region_optimization:
                @tf.custom_gradient
                def trust_region_layer(x):
                    def grad(g):
                        # flip the sign and undo the batch averaging so g is the per-sample gradient of the objective
                        g = - g * batch_size
                        # k is the gradient of KL(average_policy || policy) w.r.t. the policy output
                        k = - average_policy / (self.policy_probs + eps)
                        # size of the violation of the linearized KL constraint, clipped at zero
                        adj = tf.nn.relu(
                            (tf.reduce_sum(k * g, axis=1) - self.ap.algorithm.max_KL_divergence)
                            / (tf.reduce_sum(tf.square(k), axis=1) + eps))
                        # remove the violating component of g along k, then restore sign and scale
                        g = g - tf.expand_dims(adj, 1) * k
                        return - g / batch_size
                    return tf.identity(x), grad
                self.output = trust_region_layer(self.output)

    def _build_discrete_net(self, input_layer, action_space):
        self.num_actions = len(action_space.actions)
        self.actions = tf.placeholder(tf.int32, [None], name='{}_actions'.format(self.get_name()))
        self.input.append(self.actions)

        policy_values = self.dense_layer(self.num_actions)(input_layer, name='fc')
        self.policy_probs = tf.nn.softmax(policy_values, name='{}_policy'.format(self.get_name()))

        # (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
        self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
        self.output = self.policy_probs
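For readers who prefer the loss terms in plain NumPy: the sketch below mirrors `probability_loss` (truncated importance sampling on the taken actions) and `bias_correction_loss` (the correction summed over all actions, weighted by the current policy) built in `_build_module` above. Argument names are illustrative, not Coach's API, and the stop-gradient on the policy weighting is implicit here since this is a pure forward computation.

import numpy as np

def acer_policy_loss(policy_probs, actions, q_values, v_values, q_retrace,
                     rho, rho_taken, truncation=10.0, eps=1e-8):
    """policy_probs: (batch, num_actions) pi(a | x)
    actions:      (batch,) actions taken by the behaviour policy
    q_values:     (batch, num_actions) Q(x, a)
    v_values:     (batch,) V(x)
    q_retrace:    (batch,) Retrace targets for the taken actions
    rho:          (batch, num_actions) importance weights pi / mu for every action
    rho_taken:    (batch,) importance weights for the taken actions
    """
    batch = np.arange(len(actions))

    # truncated importance sampling over the actions that were actually taken
    log_pi_taken = np.log(policy_probs[batch, actions] + eps)
    probability_loss = -np.mean(
        log_pi_taken * (q_retrace - v_values) * np.minimum(truncation, rho_taken))

    # bias correction over all actions, active only where the weight was truncated
    correction = np.maximum(0.0, 1.0 - truncation / (rho + eps))
    bias_correction_loss = -np.mean(np.sum(
        np.log(policy_probs + eps)
        * (q_values - v_values[:, None])
        * correction
        * policy_probs,
        axis=1))

    return probability_loss + bias_correction_loss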
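The custom gradient in `trust_region_layer` applies a trust-region projection: when the linearized KL change along the proposed update exceeds `max_KL_divergence`, the gradient is adjusted along k = -average_policy / policy. Below is a NumPy sketch of just that projection step, leaving out the sign flip and batch-size bookkeeping the layer does around it; names are illustrative.

import numpy as np

def trust_region_project(g, policy_probs, average_policy, max_kl, eps=1e-8):
    """g:              (batch, num_actions) per-sample gradient of the objective w.r.t. the policy output
    policy_probs:   (batch, num_actions) current policy pi(a | x)
    average_policy: (batch, num_actions) slowly updated average policy
    max_kl:         trust-region threshold (max_KL_divergence)
    """
    # gradient of KL(average_policy || policy) with respect to the policy output
    k = -average_policy / (policy_probs + eps)
    # how far the linearized KL constraint k^T g <= max_kl is violated, clipped at zero
    adj = np.maximum(0.0, (np.sum(k * g, axis=1) - max_kl)
                     / (np.sum(np.square(k), axis=1) + eps))
    # remove the violating component of g along k
    return g - adj[:, None] * k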