mirror of https://github.com/gryf/coach.git synced 2026-02-15 13:35:55 +01:00

ACER algorithm (#184)

* initial ACER commit

* Code cleanup + several fixes

* Q-retrace bug fix + small clean-ups

* added documentation for ACER

* ACER benchmarks

* update benchmarks table

* Add nightly running of golden and trace tests. (#202)

Resolves #200

* comment out nightly trace tests until values reset.

* remove redundant observe ignore (#168)

* ensure nightly test env containers exist. (#205)

Also bump integration test timeout

* wxPython removal (#207)

Replacing wxPython with Python's Tkinter.
Also removing the option to choose multiple files, which is unused and causes errors, and fixing the load file/directory spinner.

* Create CONTRIBUTING.md (#210)

* Create CONTRIBUTING.md.  Resolves #188

* run nightly golden tests sequentially. (#217)

Should reduce resource requirements and potential CPU contention, but will increase
overall execution time.

* tests: added new setup configuration + test args (#211)

- added utils for future tests and conftest
- added test args

* new docs build

* golden test update
Author: shadiendrawis
Date: 2019-02-20 23:52:34 +02:00
Committed by: GitHub
Parent: 7253f511ed
Commit: 2b5d1dabe6
175 changed files with 2327 additions and 664 deletions


@@ -350,7 +350,7 @@ class TensorFlowArchitecture(Architecture):
                 importance_weight = np.ones(target_ph.shape[0])
             else:
                 importance_weight = importance_weights[placeholder_idx]
-            importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
+            importance_weight = np.reshape(importance_weight, (-1,) + (1,) * (len(target_ph.shape) - 1))
             feed_dict[self.importance_weights[placeholder_idx]] = importance_weight
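
The change in this hunk is whitespace only; the line itself reshapes a per-sample importance weight vector so that it broadcasts across the trailing dimensions of the target tensor. A minimal NumPy sketch of that broadcasting, with made-up shapes that are not taken from the diff:

import numpy as np

# a hypothetical batch of 4 targets, each of shape (3, 2)
target = np.ones((4, 3, 2))
importance_weight = np.array([0.5, 1.0, 2.0, 1.5])   # one weight per sample, shape (4,)

# (4,) -> (4, 1, 1), so the weight broadcasts over the trailing target dimensions
importance_weight = np.reshape(importance_weight, (-1,) + (1,) * (len(target.shape) - 1))
weighted = importance_weight * target                 # shape (4, 3, 2)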


@@ -11,6 +11,7 @@ from .q_head import QHead
 from .quantile_regression_q_head import QuantileRegressionQHead
 from .rainbow_q_head import RainbowQHead
 from .v_head import VHead
+from .acer_policy_head import ACERPolicyHead

 __all__ = [
     'CategoricalQHead',
@@ -25,5 +26,6 @@ __all__ = [
     'QHead',
     'QuantileRegressionQHead',
     'RainbowQHead',
-    'VHead'
+    'VHead',
+    'ACERPolicyHead'
 ]
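
With this registration, the new head can be imported from the heads package namespace like the existing ones, e.g. (assuming a checkout that includes this commit):

from rl_coach.architectures.tensorflow_components.heads import ACERPolicyHead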


@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf

from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.architectures.tensorflow_components.heads.head import Head
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import ActionProbabilities
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps


class ACERPolicyHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
                 dense_layer=Dense):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                         dense_layer=dense_layer)
        self.name = 'acer_policy_head'
        self.return_type = ActionProbabilities
        self.beta = None
        self.action_penalty = None

        # a scalar weight that penalizes low entropy values to encourage exploration
        if hasattr(agent_parameters.algorithm, 'beta_entropy'):
            # we set the beta value as a tf variable so it can be updated later if needed
            self.beta = tf.Variable(float(agent_parameters.algorithm.beta_entropy),
                                    trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
            self.beta_placeholder = tf.placeholder('float')
            self.set_beta = tf.assign(self.beta, self.beta_placeholder)

    def _build_module(self, input_layer):
        if isinstance(self.spaces.action, DiscreteActionSpace):
            # create a discrete action network (softmax probabilities output)
            self._build_discrete_net(input_layer, self.spaces.action)
        else:
            raise ValueError("only discrete action spaces are supported for ACER")

        if self.is_local:
            # add entropy regularization
            if self.beta:
                self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
                self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
                tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)

            # Truncated importance sampling with bias corrections
            importance_sampling_weight = tf.placeholder(tf.float32, [None, self.num_actions],
                                                        name='{}_importance_sampling_weight'.format(self.get_name()))
            self.input.append(importance_sampling_weight)
            importance_sampling_weight_i = tf.placeholder(tf.float32, [None],
                                                          name='{}_importance_sampling_weight_i'.format(self.get_name()))
            self.input.append(importance_sampling_weight_i)

            V_values = tf.placeholder(tf.float32, [None], name='{}_V_values'.format(self.get_name()))
            self.target.append(V_values)
            Q_values = tf.placeholder(tf.float32, [None, self.num_actions], name='{}_Q_values'.format(self.get_name()))
            self.input.append(Q_values)
            Q_retrace = tf.placeholder(tf.float32, [None], name='{}_Q_retrace'.format(self.get_name()))
            self.input.append(Q_retrace)

            action_log_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
            self.probability_loss = -tf.reduce_mean(action_log_probs_wrt_policy
                                                    * (Q_retrace - V_values)
                                                    * tf.minimum(self.ap.algorithm.importance_weight_truncation,
                                                                 importance_sampling_weight_i))

            log_probs_wrt_policy = tf.log(self.policy_probs + eps)
            bias_correction_gain = tf.reduce_sum(log_probs_wrt_policy
                                                 * (Q_values - tf.expand_dims(V_values, 1))
                                                 * tf.nn.relu(1.0 - (self.ap.algorithm.importance_weight_truncation
                                                                     / (importance_sampling_weight + eps)))
                                                 * tf.stop_gradient(self.policy_probs),
                                                 axis=1)
            self.bias_correction_loss = -tf.reduce_mean(bias_correction_gain)

            self.loss = self.probability_loss + self.bias_correction_loss
            tf.losses.add_loss(self.loss)

            # Trust region
            batch_size = tf.to_float(tf.shape(input_layer)[0])
            average_policy = tf.placeholder(tf.float32, [None, self.num_actions],
                                            name='{}_average_policy'.format(self.get_name()))
            self.input.append(average_policy)
            average_policy_distribution = tf.contrib.distributions.Categorical(probs=(average_policy + eps))
            self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(average_policy_distribution,
                                                                               self.policy_distribution))

            if self.ap.algorithm.use_trust_region_optimization:
                @tf.custom_gradient
                def trust_region_layer(x):
                    def grad(g):
                        g = - g * batch_size
                        k = - average_policy / (self.policy_probs + eps)
                        adj = tf.nn.relu(
                            (tf.reduce_sum(k * g, axis=1) - self.ap.algorithm.max_KL_divergence)
                            / (tf.reduce_sum(tf.square(k), axis=1) + eps))
                        g = g - tf.expand_dims(adj, 1) * k
                        return - g / batch_size
                    return tf.identity(x), grad

                self.output = trust_region_layer(self.output)

    def _build_discrete_net(self, input_layer, action_space):
        self.num_actions = len(action_space.actions)
        self.actions = tf.placeholder(tf.int32, [None], name='{}_actions'.format(self.get_name()))
        self.input.append(self.actions)

        policy_values = self.dense_layer(self.num_actions)(input_layer, name='fc')
        self.policy_probs = tf.nn.softmax(policy_values, name='{}_policy'.format(self.get_name()))

        # (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
        self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
        self.output = self.policy_probs
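
To make the two loss terms and the trust-region trick above easier to follow, here is a hedged NumPy sketch of the same arithmetic. All names and values are illustrative: in the graph above, V_values, Q_values, Q_retrace and the importance-sampling weights arrive through placeholders fed by the ACER agent, while the sketch uses small made-up arrays (with V taken as the policy-weighted Q purely for the example), and the trust-region function reproduces only the core adjustment inside trust_region_layer's custom gradient, without the sign flips and batch-size rescaling applied in the TF graph.

import numpy as np

eps = 1e-8   # stands in for rl_coach.utils.eps
c = 10.0     # stands in for ap.algorithm.importance_weight_truncation (illustrative value)

# illustrative batch: 2 samples, 3 discrete actions
policy_probs = np.array([[0.5, 0.3, 0.2],
                         [0.1, 0.6, 0.3]])            # pi(a|s), the softmax output of the head
actions = np.array([0, 1])                            # actions actually taken
rho = np.array([[1.2, 0.4, 25.0],
                [0.8, 15.0, 0.5]])                    # per-action pi/mu (importance_sampling_weight)
rho_i = rho[np.arange(2), actions]                    # weight of the taken action (importance_sampling_weight_i)
Q_values = np.array([[1.0, 0.5, 0.2],
                     [0.3, 0.9, 0.1]])                # Q-head output
V_values = (policy_probs * Q_values).sum(axis=1)      # V as the policy-weighted Q (illustrative)
Q_retrace = np.array([1.1, 0.8])                      # Retrace targets for the taken actions

# truncated importance-sampling term: taken action only, weight clipped at c
log_prob_taken = np.log(policy_probs[np.arange(2), actions] + eps)
probability_loss = -np.mean(log_prob_taken * (Q_retrace - V_values) * np.minimum(c, rho_i))

# bias-correction term: sums over all actions, non-zero only where rho > c
log_probs = np.log(policy_probs + eps)
bias_correction_gain = np.sum(log_probs
                              * (Q_values - V_values[:, None])
                              * np.maximum(0.0, 1.0 - c / (rho + eps))
                              * policy_probs,          # plays the role of tf.stop_gradient(pi)
                              axis=1)
bias_correction_loss = -np.mean(bias_correction_gain)
loss = probability_loss + bias_correction_loss

# core of the trust-region gradient: clip the component of g along
# k = d KL(average_policy || pi) / d pi whenever k . g exceeds max_KL_divergence
def trust_region_adjust(g, policy_probs, average_policy, max_kl):
    k = -average_policy / (policy_probs + eps)
    adj = np.maximum(0.0, (np.sum(k * g, axis=1) - max_kl)
                     / (np.sum(np.square(k), axis=1) + eps))
    return g - adj[:, None] * k

average_policy = np.array([[0.4, 0.4, 0.2],
                           [0.2, 0.5, 0.3]])
g = np.ones_like(policy_probs) * 0.1                   # a stand-in policy gradient
adjusted_g = trust_region_adjust(g, policy_probs, average_policy, max_kl=1.0)

The first term follows the taken action with its importance weight truncated at c, the second term re-introduces the probability mass of actions whose weight exceeds c, and the trust-region step caps how far a single update can move the policy away from the slowly-updated average policy.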