mirror of https://github.com/gryf/coach.git synced 2025-12-18 03:30:19 +01:00

Corrected MXNet's PPO Head for Continuous Action Spaces (#84)

* Changes required for Continuous PPO Head with MXNet. Used in MountainCarContinuous_ClippedPPO.

* Simplified changes for continuous PPO.

* Cleaned up to avoid duplicate code and simplified covariance creation.
Authored by Thom Lane on 2018-11-15 13:27:54 -08:00
Committed by Scott Leishman
parent fde73ced13
commit 3358e04a6a
3 changed files with 25 additions and 19 deletions
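The head touched by this commit parameterises a diagonal Gaussian policy: the network predicts per-action means, while a single learned log-std vector (shared across states) supplies the spread. As a hedged numpy sketch of how an agent would then draw a continuous action, with made-up numbers (this sampling code is illustrative only and is not the repository's exploration policy):

import numpy as np

policy_means = np.array([0.3])     # one action dimension, as in MountainCarContinuous
policy_std = np.exp(np.zeros(1))   # exp(log_std); all ones while log_std is initialised to zero

# sample from N(mean, diag(std^2)) and score it under the diagonal-Gaussian density
action = np.random.normal(loc=policy_means, scale=policy_std)
log_prob = (-0.5 * ((action - policy_means) / policy_std) ** 2
            - np.log(policy_std) - 0.5 * np.log(2 * np.pi)).sum()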

File 1 of 3

@@ -299,8 +299,7 @@ class MxnetArchitecture(Architecture):
         assert outputs is None, "outputs must be None"
         output = self._predict(inputs)
-        output = tuple(o.asnumpy() for o in output)
+        output = list(o.asnumpy() for o in output)
         if squeeze_output:
             output = squeeze_list(output)
         return output

File 2 of 3

@@ -412,7 +412,10 @@ class SingleModel(HybridBlock):
         # Head
         outputs = tuple()
         for head in self._output_heads:
-            outputs += (head(state_embedding),)
+            out = head(state_embedding)
+            if not isinstance(out, tuple):
+                out = (out,)
+            outputs += out
         return outputs
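A stand-alone illustration of the tuple normalisation in the hunk above, in plain Python with hypothetical lambda heads standing in for coach's head blocks: single-output heads get wrapped into 1-tuples, so that multi-output heads such as the continuous PPO head, which now returns a (means, std) pair, are spliced flat into the model's outputs.

def collect_head_outputs(heads, state_embedding):
    outputs = tuple()
    for head in heads:
        out = head(state_embedding)
        if not isinstance(out, tuple):
            out = (out,)   # single-output heads become 1-tuples
        outputs += out     # multi-output heads are spliced in flat
    return outputs

value_head = lambda s: 'V(s)'                          # one output
ppo_head = lambda s: ('policy_means', 'policy_std')    # two outputs
assert collect_head_outputs([value_head, ppo_head], None) == ('V(s)', 'policy_means', 'policy_std')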

File 3 of 3

@@ -28,7 +28,7 @@ class MultivariateNormalDist:
                  sigma: nd_sym_type,
                  F: ModuleType=mx.nd) -> None:
         """
-        Distribution object for Multivariate Normal. Works with batches.
+        Distribution object for Multivariate Normal. Works with batches.
         Optionally works with batches and time steps, but be consistent in usage: i.e. if using time_step,
         mean, sigma and data for log_prob must all include a time_step dimension.
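The docstring's consistency requirement, restated as a shape sketch with made-up sizes (the coach class itself is not constructed here; only the shapes it would expect are shown): if mean carries a time_step axis, then sigma and the samples given to log_prob must carry it too.

import mxnet as mx

batch_size, time_step, num_actions = 4, 3, 2
mean = mx.nd.zeros((batch_size, time_step, num_actions))                      # (B, T, A)
eye = mx.nd.eye(num_actions).reshape((1, 1, num_actions, num_actions))
sigma = eye.broadcast_to((batch_size, time_step, num_actions, num_actions))   # (B, T, A, A)
actions = mx.nd.zeros((batch_size, time_step, num_actions))                   # what log_prob would receive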
@@ -264,12 +264,12 @@ class ContinuousPPOHead(nn.HybridBlock):
             # but since we assume the action probability variables are independent,
             # only the diagonal entries of the covariance matrix are specified.
             self.log_std = self.params.get('log_std',
-                                           shape=num_actions,
+                                           shape=(num_actions,),
                                            init=mx.init.Zero(),
                                            allow_deferred_init=True)

     # todo: is_local?
-    def hybrid_forward(self, F: ModuleType, x: nd_sym_type, log_std: nd_sym_type) -> List[nd_sym_type]:
+    def hybrid_forward(self, F: ModuleType, x: nd_sym_type, log_std: nd_sym_type) -> Tuple[nd_sym_type, nd_sym_type]:
         """
         Used for forward pass through head network.
@@ -282,8 +282,8 @@ class ContinuousPPOHead(nn.HybridBlock):
             of shape (batch_size, time_step, action_mean).
         """
         policy_means = self.dense(x)
-        policy_std = log_std.exp()
-        return [policy_means, policy_std]
+        policy_std = log_std.exp().expand_dims(0).broadcast_like(policy_means)
+        return policy_means, policy_std


 class ClippedPPOLossDiscrete(HeadLoss):
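Putting the two ContinuousPPOHead hunks together, a trimmed-down Gluon sketch of how the pieces interact (the dense layer configuration and the toy input shape are assumptions; the rest mirrors the diff): because log_std is declared through self.params.get, Gluon passes it to hybrid_forward as a keyword argument, and the exp/broadcast pair turns it into a per-sample std that matches the batch of means.

import mxnet as mx
from mxnet.gluon import nn

class TinyContinuousHead(nn.HybridBlock):
    def __init__(self, num_actions):
        super(TinyContinuousHead, self).__init__()
        with self.name_scope():
            self.dense = nn.Dense(units=num_actions)          # assumed layer config
            self.log_std = self.params.get('log_std',
                                           shape=(num_actions,),
                                           init=mx.init.Zero(),
                                           allow_deferred_init=True)

    def hybrid_forward(self, F, x, log_std):
        policy_means = self.dense(x)
        policy_std = log_std.exp().expand_dims(0).broadcast_like(policy_means)
        return policy_means, policy_std

head = TinyContinuousHead(num_actions=2)
head.initialize()
means, std = head(mx.nd.random.normal(shape=(4, 8)))  # std has shape (4, 2), all ones here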
@@ -490,8 +490,8 @@ class ClippedPPOLossContinuous(HeadLoss):
             of shape (batch_size, num_actions) or
             of shape (batch_size, time_step, num_actions).
         :param actions: true actions taken during rollout,
-            of shape (batch_size) or
-            of shape (batch_size, time_step).
+            of shape (batch_size, num_actions) or
+            of shape (batch_size, time_step, num_actions).
         :param old_policy_means: action means for previous policy,
             of shape (batch_size, num_actions) or
             of shape (batch_size, time_step, num_actions).
@@ -500,20 +500,24 @@ class ClippedPPOLossContinuous(HeadLoss):
             of shape (batch_size, time_step, num_actions).
         :param clip_param_rescaler: scales epsilon to use for likelihood ratio clipping.
         :param advantages: change in state value after taking action (a.k.a advantage)
-            of shape (batch_size) or
+            of shape (batch_size,) or
             of shape (batch_size, time_step).
         :param kl_coefficient: loss coefficient applied kl divergence loss (also see high_kl_penalty_coefficient).
         :return: loss, of shape (batch_size).
         """
-        old_var = old_policy_stds ** 2
-        # sets diagonal in (batch size and time step) covariance matrices
-        old_covar = mx.nd.eye(N=self.num_actions) * (old_var + eps).broadcast_like(old_policy_means).expand_dims(-2)
+        def diagonal_covariance(stds, size):
+            vars = stds ** 2
+            # sets diagonal in (batch size and time step) covariance matrices
+            vars_tiled = vars.expand_dims(2).tile((1, 1, size))
+            covars = F.broadcast_mul(vars_tiled, F.eye(size))
+            return covars
+
+        old_covar = diagonal_covariance(stds=old_policy_stds, size=self.num_actions)
         old_policy_dist = MultivariateNormalDist(self.num_actions, old_policy_means, old_covar, F=F)
         action_probs_wrt_old_policy = old_policy_dist.log_prob(actions)

-        new_var = new_policy_stds ** 2
-        # sets diagonal in (batch size and time step) covariance matrices
-        new_covar = mx.nd.eye(N=self.num_actions) * (new_var + eps).broadcast_like(new_policy_means).expand_dims(-2)
+        new_covar = diagonal_covariance(stds=new_policy_stds, size=self.num_actions)
         new_policy_dist = MultivariateNormalDist(self.num_actions, new_policy_means, new_covar, F=F)
         action_probs_wrt_new_policy = new_policy_dist.log_prob(actions)
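A stand-alone check of the diagonal_covariance helper introduced above, run with mx.nd in place of F and with made-up sizes, using the (batch_size, num_actions) case from the docstring: each batch entry becomes a num_actions x num_actions matrix with the squared stds on its diagonal.

import mxnet as mx

def diagonal_covariance(stds, size):
    vars = stds ** 2
    vars_tiled = vars.expand_dims(2).tile((1, 1, size))      # (batch, actions, actions): each variance repeated along the last axis
    return mx.nd.broadcast_mul(vars_tiled, mx.nd.eye(size))  # zero out the off-diagonal entries

stds = mx.nd.ones((4, 2)) * 0.5            # (batch_size, num_actions)
covar = diagonal_covariance(stds, size=2)  # (4, 2, 2)
print(covar[0])                            # [[0.25, 0.], [0., 0.25]]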
@@ -607,7 +611,7 @@ class PPOHead(Head):
         if isinstance(self.spaces.action, DiscreteActionSpace):
             self.net = DiscretePPOHead(num_actions=len(self.spaces.action.actions))
         elif isinstance(self.spaces.action, BoxActionSpace):
-            self.net = ContinuousPPOHead(num_actions=len(self.spaces.action.actions))
+            self.net = ContinuousPPOHead(num_actions=self.spaces.action.shape[0])
         else:
             raise ValueError("Only discrete or continuous action spaces are supported for PPO.")
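The one-line change above is the crux of the head-construction fix: a discrete space enumerates its actions, so len(...) counts them, while a box (continuous) space is described only by its shape, so the number of action dimensions has to come from shape[0]. A dependency-free sketch with hypothetical stand-in classes (coach's real DiscreteActionSpace and BoxActionSpace are not imported here):

class FakeDiscreteSpace:          # hypothetical stand-in for DiscreteActionSpace
    def __init__(self, num_actions):
        self.actions = list(range(num_actions))

class FakeBoxSpace:               # hypothetical stand-in for BoxActionSpace
    def __init__(self, shape):
        self.shape = shape        # no enumerable .actions on a continuous space

discrete = FakeDiscreteSpace(3)
box = FakeBoxSpace((1,))          # e.g. MountainCarContinuous: one action dimension

num_discrete_actions = len(discrete.actions)  # 3 -> DiscretePPOHead
num_box_actions = box.shape[0]                # 1 -> ContinuousPPOHead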
@@ -635,7 +639,7 @@ class PPOHead(Head):
                                           self.kl_cutoff, self.high_kl_penalty_coefficient,
                                           self.loss_weight)
         elif isinstance(self.spaces.action, BoxActionSpace):
-            loss = ClippedPPOLossContinuous(len(self.spaces.action.actions),
+            loss = ClippedPPOLossContinuous(self.spaces.action.shape[0],
                                             self.clip_likelihood_ratio_using_epsilon,
                                             self.beta,
                                             self.use_kl_regularization, self.initial_kl_coefficient,