From 3358e04a6a6bd4a79b431f8ebc9ca72266bdea10 Mon Sep 17 00:00:00 2001
From: Thom Lane
Date: Thu, 15 Nov 2018 13:27:54 -0800
Subject: [PATCH] Corrected MXNet's PPO Head for Continuous Action Spaces (#84)

* Changes required for Continuous PPO Head with MXNet. Used in MountainCarContinuous_ClippedPPO.

* Simplified changes for continuous ppo.

* Cleaned up to avoid duplicate code, and simplified covariance creation.
---
 .../mxnet_components/architecture.py    |  3 +-
 .../mxnet_components/general_network.py |  5 ++-
 .../mxnet_components/heads/ppo_head.py  | 36 ++++++++++---------
 3 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/rl_coach/architectures/mxnet_components/architecture.py b/rl_coach/architectures/mxnet_components/architecture.py
index dd860fb..8b39851 100644
--- a/rl_coach/architectures/mxnet_components/architecture.py
+++ b/rl_coach/architectures/mxnet_components/architecture.py
@@ -299,8 +299,7 @@ class MxnetArchitecture(Architecture):
         assert outputs is None, "outputs must be None"
 
         output = self._predict(inputs)
-
-        output = tuple(o.asnumpy() for o in output)
+        output = list(o.asnumpy() for o in output)
         if squeeze_output:
             output = squeeze_list(output)
         return output
diff --git a/rl_coach/architectures/mxnet_components/general_network.py b/rl_coach/architectures/mxnet_components/general_network.py
index bb1b176..99645fe 100644
--- a/rl_coach/architectures/mxnet_components/general_network.py
+++ b/rl_coach/architectures/mxnet_components/general_network.py
@@ -412,7 +412,10 @@ class SingleModel(HybridBlock):
         # Head
         outputs = tuple()
         for head in self._output_heads:
-            outputs += (head(state_embedding),)
+            out = head(state_embedding)
+            if not isinstance(out, tuple):
+                out = (out,)
+            outputs += out
 
         return outputs
 
diff --git a/rl_coach/architectures/mxnet_components/heads/ppo_head.py b/rl_coach/architectures/mxnet_components/heads/ppo_head.py
index 01b2192..a6e65e3 100644
--- a/rl_coach/architectures/mxnet_components/heads/ppo_head.py
+++ b/rl_coach/architectures/mxnet_components/heads/ppo_head.py
@@ -28,7 +28,7 @@ class MultivariateNormalDist:
                  sigma: nd_sym_type,
                  F: ModuleType=mx.nd) -> None:
         """
-        Distribution object for Multivariate Normal. Works with batches. 
+        Distribution object for Multivariate Normal. Works with batches.
         Optionally works with batches and time steps, but be consistent in usage: i.e. if using
         time_step, mean, sigma and data for log_prob must all include a time_step dimension.
 
@@ -264,12 +264,12 @@ class ContinuousPPOHead(nn.HybridBlock):
             # but since we assume the action probability variables are independent,
             # only the diagonal entries of the covariance matrix are specified.
             self.log_std = self.params.get('log_std',
-                                           shape=num_actions,
+                                           shape=(num_actions,),
                                            init=mx.init.Zero(),
                                            allow_deferred_init=True)
 
         # todo: is_local?
-    def hybrid_forward(self, F: ModuleType, x: nd_sym_type, log_std: nd_sym_type) -> List[nd_sym_type]:
+    def hybrid_forward(self, F: ModuleType, x: nd_sym_type, log_std: nd_sym_type) -> Tuple[nd_sym_type, nd_sym_type]:
         """
         Used for forward pass through head network.
 
@@ -282,8 +282,8 @@ class ContinuousPPOHead(nn.HybridBlock):
             of shape (batch_size, time_step, action_mean).
""" policy_means = self.dense(x) - policy_std = log_std.exp() - return [policy_means, policy_std] + policy_std = log_std.exp().expand_dims(0).broadcast_like(policy_means) + return policy_means, policy_std class ClippedPPOLossDiscrete(HeadLoss): @@ -490,8 +490,8 @@ class ClippedPPOLossContinuous(HeadLoss): of shape (batch_size, num_actions) or of shape (batch_size, time_step, num_actions). :param actions: true actions taken during rollout, - of shape (batch_size) or - of shape (batch_size, time_step). + of shape (batch_size, num_actions) or + of shape (batch_size, time_step, num_actions). :param old_policy_means: action means for previous policy, of shape (batch_size, num_actions) or of shape (batch_size, time_step, num_actions). @@ -500,20 +500,24 @@ class ClippedPPOLossContinuous(HeadLoss): of shape (batch_size, time_step, num_actions). :param clip_param_rescaler: scales epsilon to use for likelihood ratio clipping. :param advantages: change in state value after taking action (a.k.a advantage) - of shape (batch_size) or + of shape (batch_size,) or of shape (batch_size, time_step). :param kl_coefficient: loss coefficient applied kl divergence loss (also see high_kl_penalty_coefficient). :return: loss, of shape (batch_size). """ - old_var = old_policy_stds ** 2 - # sets diagonal in (batch size and time step) covariance matrices - old_covar = mx.nd.eye(N=self.num_actions) * (old_var + eps).broadcast_like(old_policy_means).expand_dims(-2) + + def diagonal_covariance(stds, size): + vars = stds ** 2 + # sets diagonal in (batch size and time step) covariance matrices + vars_tiled = vars.expand_dims(2).tile((1, 1, size)) + covars = F.broadcast_mul(vars_tiled, F.eye(size)) + return covars + + old_covar = diagonal_covariance(stds=old_policy_stds, size=self.num_actions) old_policy_dist = MultivariateNormalDist(self.num_actions, old_policy_means, old_covar, F=F) action_probs_wrt_old_policy = old_policy_dist.log_prob(actions) - new_var = new_policy_stds ** 2 - # sets diagonal in (batch size and time step) covariance matrices - new_covar = mx.nd.eye(N=self.num_actions) * (new_var + eps).broadcast_like(new_policy_means).expand_dims(-2) + new_covar = diagonal_covariance(stds=new_policy_stds, size=self.num_actions) new_policy_dist = MultivariateNormalDist(self.num_actions, new_policy_means, new_covar, F=F) action_probs_wrt_new_policy = new_policy_dist.log_prob(actions) @@ -607,7 +611,7 @@ class PPOHead(Head): if isinstance(self.spaces.action, DiscreteActionSpace): self.net = DiscretePPOHead(num_actions=len(self.spaces.action.actions)) elif isinstance(self.spaces.action, BoxActionSpace): - self.net = ContinuousPPOHead(num_actions=len(self.spaces.action.actions)) + self.net = ContinuousPPOHead(num_actions=self.spaces.action.shape[0]) else: raise ValueError("Only discrete or continuous action spaces are supported for PPO.") @@ -635,7 +639,7 @@ class PPOHead(Head): self.kl_cutoff, self.high_kl_penalty_coefficient, self.loss_weight) elif isinstance(self.spaces.action, BoxActionSpace): - loss = ClippedPPOLossContinuous(len(self.spaces.action.actions), + loss = ClippedPPOLossContinuous(self.spaces.action.shape[0], self.clip_likelihood_ratio_using_epsilon, self.beta, self.use_kl_regularization, self.initial_kl_coefficient,