From 3358e04a6a6bd4a79b431f8ebc9ca72266bdea10 Mon Sep 17 00:00:00 2001
From: Thom Lane
Date: Thu, 15 Nov 2018 13:27:54 -0800
Subject: [PATCH] Corrected MXNet's PPO Head for Continuous Action Spaces (#84)

* Changes required for Continuous PPO Head with MXNet. Used in MountainCarContinuous_ClippedPPO.

* Simplified changes for continuous ppo.

* Cleaned up to avoid duplicate code, and simplified covariance creation.
---
 .../mxnet_components/architecture.py    |  3 +-
 .../mxnet_components/general_network.py |  5 ++-
 .../mxnet_components/heads/ppo_head.py  | 36 ++++++++++---------
 3 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/rl_coach/architectures/mxnet_components/architecture.py b/rl_coach/architectures/mxnet_components/architecture.py
index dd860fb..8b39851 100644
--- a/rl_coach/architectures/mxnet_components/architecture.py
+++ b/rl_coach/architectures/mxnet_components/architecture.py
@@ -299,8 +299,7 @@ class MxnetArchitecture(Architecture):
         assert outputs is None, "outputs must be None"
 
         output = self._predict(inputs)
-
-        output = tuple(o.asnumpy() for o in output)
+        output = list(o.asnumpy() for o in output)
         if squeeze_output:
             output = squeeze_list(output)
         return output
diff --git a/rl_coach/architectures/mxnet_components/general_network.py b/rl_coach/architectures/mxnet_components/general_network.py
index bb1b176..99645fe 100644
--- a/rl_coach/architectures/mxnet_components/general_network.py
+++ b/rl_coach/architectures/mxnet_components/general_network.py
@@ -412,7 +412,10 @@ class SingleModel(HybridBlock):
         # Head
         outputs = tuple()
         for head in self._output_heads:
-            outputs += (head(state_embedding),)
+            out = head(state_embedding)
+            if not isinstance(out, tuple):
+                out = (out,)
+            outputs += out
 
         return outputs
 
diff --git a/rl_coach/architectures/mxnet_components/heads/ppo_head.py b/rl_coach/architectures/mxnet_components/heads/ppo_head.py
index 01b2192..a6e65e3 100644
--- a/rl_coach/architectures/mxnet_components/heads/ppo_head.py
+++ b/rl_coach/architectures/mxnet_components/heads/ppo_head.py
@@ -28,7 +28,7 @@ class MultivariateNormalDist:
                  sigma: nd_sym_type,
                  F: ModuleType=mx.nd) -> None:
         """
-        Distribution object for Multivariate Normal. Works with batches. 
+        Distribution object for Multivariate Normal. Works with batches.
         Optionally works with batches and time steps, but be consistent in usage: i.e. if using
         time_step, mean, sigma and data for log_prob must all include a time_step dimension.
 
@@ -264,12 +264,12 @@ class ContinuousPPOHead(nn.HybridBlock):
             # but since we assume the action probability variables are independent,
             # only the diagonal entries of the covariance matrix are specified.
             self.log_std = self.params.get('log_std',
-                                           shape=num_actions,
+                                           shape=(num_actions,),
                                            init=mx.init.Zero(),
                                            allow_deferred_init=True)
 
         # todo: is_local?
-    def hybrid_forward(self, F: ModuleType, x: nd_sym_type, log_std: nd_sym_type) -> List[nd_sym_type]:
+    def hybrid_forward(self, F: ModuleType, x: nd_sym_type, log_std: nd_sym_type) -> Tuple[nd_sym_type, nd_sym_type]:
         """
         Used for forward pass through head network.
 
@@ -282,8 +282,8 @@ class ContinuousPPOHead(nn.HybridBlock):
             of shape (batch_size, time_step, action_mean).
""" policy_means = self.dense(x) - policy_std = log_std.exp() - return [policy_means, policy_std] + policy_std = log_std.exp().expand_dims(0).broadcast_like(policy_means) + return policy_means, policy_std class ClippedPPOLossDiscrete(HeadLoss): @@ -490,8 +490,8 @@ class ClippedPPOLossContinuous(HeadLoss): of shape (batch_size, num_actions) or of shape (batch_size, time_step, num_actions). :param actions: true actions taken during rollout, - of shape (batch_size) or - of shape (batch_size, time_step). + of shape (batch_size, num_actions) or + of shape (batch_size, time_step, num_actions). :param old_policy_means: action means for previous policy, of shape (batch_size, num_actions) or of shape (batch_size, time_step, num_actions). @@ -500,20 +500,24 @@ class ClippedPPOLossContinuous(HeadLoss): of shape (batch_size, time_step, num_actions). :param clip_param_rescaler: scales epsilon to use for likelihood ratio clipping. :param advantages: change in state value after taking action (a.k.a advantage) - of shape (batch_size) or + of shape (batch_size,) or of shape (batch_size, time_step). :param kl_coefficient: loss coefficient applied kl divergence loss (also see high_kl_penalty_coefficient). :return: loss, of shape (batch_size). """ - old_var = old_policy_stds ** 2 - # sets diagonal in (batch size and time step) covariance matrices - old_covar = mx.nd.eye(N=self.num_actions) * (old_var + eps).broadcast_like(old_policy_means).expand_dims(-2) + + def diagonal_covariance(stds, size): + vars = stds ** 2 + # sets diagonal in (batch size and time step) covariance matrices + vars_tiled = vars.expand_dims(2).tile((1, 1, size)) + covars = F.broadcast_mul(vars_tiled, F.eye(size)) + return covars + + old_covar = diagonal_covariance(stds=old_policy_stds, size=self.num_actions) old_policy_dist = MultivariateNormalDist(self.num_actions, old_policy_means, old_covar, F=F) action_probs_wrt_old_policy = old_policy_dist.log_prob(actions) - new_var = new_policy_stds ** 2 - # sets diagonal in (batch size and time step) covariance matrices - new_covar = mx.nd.eye(N=self.num_actions) * (new_var + eps).broadcast_like(new_policy_means).expand_dims(-2) + new_covar = diagonal_covariance(stds=new_policy_stds, size=self.num_actions) new_policy_dist = MultivariateNormalDist(self.num_actions, new_policy_means, new_covar, F=F) action_probs_wrt_new_policy = new_policy_dist.log_prob(actions) @@ -607,7 +611,7 @@ class PPOHead(Head): if isinstance(self.spaces.action, DiscreteActionSpace): self.net = DiscretePPOHead(num_actions=len(self.spaces.action.actions)) elif isinstance(self.spaces.action, BoxActionSpace): - self.net = ContinuousPPOHead(num_actions=len(self.spaces.action.actions)) + self.net = ContinuousPPOHead(num_actions=self.spaces.action.shape[0]) else: raise ValueError("Only discrete or continuous action spaces are supported for PPO.") @@ -635,7 +639,7 @@ class PPOHead(Head): self.kl_cutoff, self.high_kl_penalty_coefficient, self.loss_weight) elif isinstance(self.spaces.action, BoxActionSpace): - loss = ClippedPPOLossContinuous(len(self.spaces.action.actions), + loss = ClippedPPOLossContinuous(self.spaces.action.shape[0], self.clip_likelihood_ratio_using_epsilon, self.beta, self.use_kl_regularization, self.initial_kl_coefficient,