Mirror of https://github.com/gryf/coach.git
ACER algorithm (#184)
* initial ACER commit
* Code cleanup + several fixes
* Q-retrace bug fix + small clean-ups
* added documentation for ACER
* ACER benchmarks
* update benchmarks table
* Add nightly running of golden and trace tests. (#202) Resolves #200
* comment out nightly trace tests until values reset.
* remove redundant observe ignore (#168)
* ensure nightly test env containers exist. (#205) Also bump integration test timeout
* wxPython removal (#207) Replacing wxPython with Python's Tkinter. Also removing the option to choose multiple files as it is unused and causes errors, and fixing the load file/directory spinner.
* Create CONTRIBUTING.md (#210)
* Create CONTRIBUTING.md. Resolves #188
* run nightly golden tests sequentially. (#217) Should reduce resource requirements and potential CPU contention but increases overall execution time.
* tests: added new setup configuration + test args (#211) - added utils for future tests and conftest - added test args
* new docs build
* golden test update
rl_coach/agents/acer_agent.py (new file, 198 lines)
@@ -0,0 +1,198 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Union

import numpy as np

from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import ACERPolicyHeadParameters, QHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters
from rl_coach.core_types import Batch
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import eps, last_sample


class ACERAlgorithmParameters(AlgorithmParameters):
    """
    :param num_steps_between_gradient_updates: (int)
        Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
        accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step
        formulation.

    :param ratio_of_replay: (int)
        The number of off-policy training iterations in each ACER iteration.

    :param num_transitions_to_start_replay: (int)
        Number of environment steps until ACER starts to train off-policy from the experience replay.
        This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions in
        the experience replay to start the off-policy training.

    :param rate_for_copying_weights_to_target: (float)
        The rate of the exponential moving average for the average policy which is used for the trust region
        optimization. The target network in this algorithm is used as the average policy.

    :param importance_weight_truncation: (float)
        The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).

    :param use_trust_region_optimization: (bool)
        If set to True, the gradients of the network will be modified with a term dependent on the KL divergence
        between the average policy and the current one, to bound the change of the policy during the network update.

    :param max_KL_divergence: (float)
        The upper bound parameter for the trust region optimization; use_trust_region_optimization needs to be set
        to True for this parameter to have an effect.

    :param beta_entropy: (float)
        An entropy regularization term can be added to the loss function in order to control exploration. This term
        is weighted using the beta value defined by beta_entropy.
    """
    def __init__(self):
        super().__init__()
        self.apply_gradients_every_x_episodes = 5
        self.num_steps_between_gradient_updates = 5000
        self.ratio_of_replay = 4
        self.num_transitions_to_start_replay = 10000
        self.rate_for_copying_weights_to_target = 0.99
        self.importance_weight_truncation = 10.0
        self.use_trust_region_optimization = True
        self.max_KL_divergence = 1.0
        self.beta_entropy = 0


class ACERNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters()
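        # two heads on a shared embedder/middleware: a Q head for per-action values and the ACER policy head
        # for the softmax policy; the Q head loss is down-weighted (0.5) relative to the policy head (1.0)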
        self.heads_parameters = [QHeadParameters(loss_weight=0.5), ACERPolicyHeadParameters(loss_weight=1.0)]
        self.optimizer_type = 'Adam'
        self.async_training = True
        self.clip_gradients = 40.0
        self.create_target_network = True


class ACERAgentParameters(AgentParameters):
    def __init__(self):
        super().__init__(algorithm=ACERAlgorithmParameters(),
                         exploration={DiscreteActionSpace: CategoricalParameters()},
                         memory=EpisodicExperienceReplayParameters(),
                         networks={"main": ACERNetworkParameters()})

    @property
    def path(self):
        return 'rl_coach.agents.acer_agent:ACERAgent'


# Actor-Critic with Experience Replay - https://arxiv.org/abs/1611.01224
class ACERAgent(PolicyOptimizationAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        # signals definition
        self.q_loss = self.register_signal('Q Loss')
        self.policy_loss = self.register_signal('Policy Loss')
        self.probability_loss = self.register_signal('Probability Loss')
        self.bias_correction_loss = self.register_signal('Bias Correction Loss')
        self.unclipped_grads = self.register_signal('Grads (unclipped)')
        self.V_Values = self.register_signal('Values')
        self.kl_divergence = self.register_signal('KL Divergence')

    def _learn_from_batch(self, batch):
        fetches = [self.networks['main'].online_network.output_heads[1].probability_loss,
                   self.networks['main'].online_network.output_heads[1].bias_correction_loss,
                   self.networks['main'].online_network.output_heads[1].kl_divergence]

        # batch contains a list of transitions to learn from
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        # get the values for the current states
        Q_values, policy_prob = self.networks['main'].online_network.predict(batch.states(network_keys))
        avg_policy_prob = self.networks['main'].target_network.predict(batch.states(network_keys))[1]
        current_state_values = np.sum(policy_prob * Q_values, axis=1)

        actions = batch.actions()
        num_transitions = batch.size
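        # the Q head is trained towards its own current predictions for all actions except the taken ones,
        # which are overwritten below with the Q-retrace targets. note that this is an alias of Q_values
        # (not a copy), so Q_values is modified in place as the targets are filled in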
        Q_head_targets = Q_values

        Q_i = Q_values[np.arange(num_transitions), actions]

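        # importance sampling ratios between the current policy and the behavior policy mu that generated the
        # transitions: rho = pi(a|s) / mu(a|s), with rho_i taken at the actions that were actually played.
        # rho_bar = min(1, rho_i) is the truncated ratio used in the Q-retrace recursion below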
        mu = batch.info('all_action_probabilities')
        rho = policy_prob / (mu + eps)
        rho_i = rho[np.arange(batch.size), actions]

        rho_bar = np.minimum(1.0, rho_i)

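        # bootstrap the retrace target from the last next-state of the batch:
        # zero if the episode terminated there, otherwise V(s) = sum_a pi(a|s) * Q(s, a)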
        if batch.game_overs()[-1]:
            Qret = 0
        else:
            result = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))
            Qret = np.sum(result[0] * result[1], axis=1)[0]

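        # backward recursion over the batch, following the Q-retrace estimator (Retrace with lambda=1) from the
        # ACER paper referenced above:
        #   Qret(s_t, a_t) = r_t + gamma * [rho_bar_{t+1} * (Qret(s_{t+1}, a_{t+1}) - Q(s_{t+1}, a_{t+1})) + V(s_{t+1})]
        # each step first discounts the running Qret and adds the reward, stores it as the Q head target for the
        # taken action, and then folds in the truncated-importance-weight correction used by the earlier step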
        for i in reversed(range(num_transitions)):
            Qret = batch.rewards()[i] + self.ap.algorithm.discount * Qret
            Q_head_targets[i, actions[i]] = Qret
            Qret = rho_bar[i] * (Qret - Q_i[i]) + current_state_values[i]

        Q_retrace = Q_head_targets[np.arange(num_transitions), actions]

        # train
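        # the extra 'output_1_k' entries feed the placeholders that the ACER policy head (output head index 1)
        # appends to its inputs, in order: the taken actions, the full importance weights rho, the per-action
        # importance weights rho_i, the Q values, the Q-retrace targets, and the average (target network) policy.
        # the targets are the Q head targets and, for the policy head, the state values V(s)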
        result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
                                                                'output_1_0': actions,
                                                                'output_1_1': rho,
                                                                'output_1_2': rho_i,
                                                                'output_1_3': Q_values,
                                                                'output_1_4': Q_retrace,
                                                                'output_1_5': avg_policy_prob},
                                                               [Q_head_targets, current_state_values],
                                                               additional_fetches=fetches)

        for network in self.networks.values():
            network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)

        # logging
        total_loss, losses, unclipped_grads, fetch_result = result[:4]
        self.q_loss.add_sample(losses[0])
        self.policy_loss.add_sample(losses[1])
        self.probability_loss.add_sample(fetch_result[0])
        self.bias_correction_loss.add_sample(fetch_result[1])
        self.unclipped_grads.add_sample(unclipped_grads)
        self.V_Values.add_sample(current_state_values)
        self.kl_divergence.add_sample(fetch_result[2])

        return total_loss, losses, unclipped_grads

    def learn_from_batch(self, batch):
        # perform on-policy training iteration
        total_loss, losses, unclipped_grads = self._learn_from_batch(batch)

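        # the number of off-policy iterations per on-policy iteration is drawn from a Poisson distribution whose
        # mean is ratio_of_replay (the replay ratio of the ACER paper); each off-policy batch is a window of
        # consecutive transitions from the episodic replay buffer (the second argument to 'sample' below
        # requests consecutive transitions)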
        if self.ap.algorithm.ratio_of_replay > 0 \
                and self.memory.num_transitions() > self.ap.algorithm.num_transitions_to_start_replay:
            n = np.random.poisson(self.ap.algorithm.ratio_of_replay)
            # perform n off-policy training iterations
            for _ in range(n):
                new_batch = Batch(self.call_memory('sample', (self.ap.algorithm.num_steps_between_gradient_updates, True)))
                result = self._learn_from_batch(new_batch)
                total_loss += result[0]
                losses += result[1]
                unclipped_grads += result[2]

        return total_loss, losses, unclipped_grads

    def get_prediction(self, states):
        tf_input_state = self.prepare_batch_for_inference(states, "main")
        return self.networks['main'].online_network.predict(tf_input_state)[1:]  # index 0 is the state value
@@ -1026,7 +1026,7 @@ class Agent(AgentInterface):
        """
        Collect all of agent's network savers
        :param parent_path_suffix: path suffix of the parent of the agent
-                                  (could be name of level manager or composite agent)
+                                  (could be name of level manager or composite agent)
        :return: collection of all agent savers
        """
        parent_path_suffix = "{}.{}".format(parent_path_suffix, self.name)

@@ -149,7 +149,7 @@ class PolicyOptimizationAgent(Agent):
            action_probabilities = np.array(action_values).squeeze()
            action = self.exploration_policy.get_action(action_probabilities)
            action_info = ActionInfo(action=action,
-                                    action_probability=action_probabilities[action])
+                                    all_action_probabilities=action_probabilities)

            self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
        elif isinstance(self.spaces.action, BoxActionSpace):
@@ -176,3 +176,13 @@ class RainbowQHeadParameters(HeadParameters):
                         dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
                         loss_weight=loss_weight)


+class ACERPolicyHeadParameters(HeadParameters):
+    def __init__(self, activation_function: str ='relu', name: str='acer_policy_head_params',
+                 num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
+                 loss_weight: float = 1.0, dense_layer=None):
+        super().__init__(parameterized_class_name="ACERPolicyHead", activation_function=activation_function, name=name,
+                         dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
+                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
+                         loss_weight=loss_weight)

@@ -350,7 +350,7 @@ class TensorFlowArchitecture(Architecture):
                importance_weight = np.ones(target_ph.shape[0])
            else:
                importance_weight = importance_weights[placeholder_idx]
-           importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
+           importance_weight = np.reshape(importance_weight, (-1,) + (1,) * (len(target_ph.shape) - 1))

            feed_dict[self.importance_weights[placeholder_idx]] = importance_weight

@@ -11,6 +11,7 @@ from .q_head import QHead
from .quantile_regression_q_head import QuantileRegressionQHead
from .rainbow_q_head import RainbowQHead
from .v_head import VHead
+from .acer_policy_head import ACERPolicyHead

__all__ = [
    'CategoricalQHead',
@@ -25,5 +26,6 @@ __all__ = [
    'QHead',
    'QuantileRegressionQHead',
    'RainbowQHead',
-   'VHead'
+   'VHead',
+   'ACERPolicyHead'
]

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import tensorflow as tf

from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.architectures.tensorflow_components.heads.head import Head
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import ActionProbabilities
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps


class ACERPolicyHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
                 dense_layer=Dense):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                         dense_layer=dense_layer)
        self.name = 'acer_policy_head'
        self.return_type = ActionProbabilities
        self.beta = None
        self.action_penalty = None

        # a scalar weight that penalizes low entropy values to encourage exploration
        if hasattr(agent_parameters.algorithm, 'beta_entropy'):
            # we set the beta value as a tf variable so it can be updated later if needed
            self.beta = tf.Variable(float(agent_parameters.algorithm.beta_entropy),
                                    trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
            self.beta_placeholder = tf.placeholder('float')
            self.set_beta = tf.assign(self.beta, self.beta_placeholder)

    def _build_module(self, input_layer):
        if isinstance(self.spaces.action, DiscreteActionSpace):
            # create a discrete action network (softmax probabilities output)
            self._build_discrete_net(input_layer, self.spaces.action)
        else:
            raise ValueError("only discrete action spaces are supported for ACER")

        if self.is_local:
            # add entropy regularization
            if self.beta:
                self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
                self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
                tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)

            # Truncated importance sampling with bias corrections
            importance_sampling_weight = tf.placeholder(tf.float32, [None, self.num_actions],
                                                        name='{}_importance_sampling_weight'.format(self.get_name()))
            self.input.append(importance_sampling_weight)
            importance_sampling_weight_i = tf.placeholder(tf.float32, [None],
                                                          name='{}_importance_sampling_weight_i'.format(self.get_name()))
            self.input.append(importance_sampling_weight_i)

            V_values = tf.placeholder(tf.float32, [None], name='{}_V_values'.format(self.get_name()))
            self.target.append(V_values)
            Q_values = tf.placeholder(tf.float32, [None, self.num_actions], name='{}_Q_values'.format(self.get_name()))
            self.input.append(Q_values)
            Q_retrace = tf.placeholder(tf.float32, [None], name='{}_Q_retrace'.format(self.get_name()))
            self.input.append(Q_retrace)

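            # the policy-gradient loss has two parts (truncated importance sampling with bias correction):
            #   - probability_loss: the on-sample term, -log pi(a_i|s_i) * (Qret_i - V_i), weighted by the
            #     truncated importance weight min(c, rho_i), with c = importance_weight_truncation
            #   - bias_correction_loss: a sum over all actions of -log pi(a|s) * (Q(s, a) - V(s)), weighted by
            #     pi(a|s) * [1 - c / rho(a|s)]_+, which compensates for the bias introduced by the truncation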
            action_log_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
            self.probability_loss = -tf.reduce_mean(action_log_probs_wrt_policy
                                                    * (Q_retrace - V_values)
                                                    * tf.minimum(self.ap.algorithm.importance_weight_truncation,
                                                                 importance_sampling_weight_i))

            log_probs_wrt_policy = tf.log(self.policy_probs + eps)
            bias_correction_gain = tf.reduce_sum(log_probs_wrt_policy
                                                 * (Q_values - tf.expand_dims(V_values, 1))
                                                 * tf.nn.relu(1.0 - (self.ap.algorithm.importance_weight_truncation
                                                                     / (importance_sampling_weight + eps)))
                                                 * tf.stop_gradient(self.policy_probs),
                                                 axis=1)
            self.bias_correction_loss = -tf.reduce_mean(bias_correction_gain)

            self.loss = self.probability_loss + self.bias_correction_loss
            tf.losses.add_loss(self.loss)

            # Trust region
            batch_size = tf.to_float(tf.shape(input_layer)[0])
            average_policy = tf.placeholder(tf.float32, [None, self.num_actions],
                                            name='{}_average_policy'.format(self.get_name()))
            self.input.append(average_policy)
            average_policy_distribution = tf.contrib.distributions.Categorical(probs=(average_policy + eps))
            self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(average_policy_distribution,
                                                                               self.policy_distribution))
            if self.ap.algorithm.use_trust_region_optimization:
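                # trust region layer, following the efficient trust region scheme of the ACER paper: the forward
                # pass is the identity, while the backward pass projects the incoming policy gradient g so that
                # its step along k = grad_pi KL(average_policy || pi) (computed here as -average_policy / pi)
                # does not exceed max_KL_divergence:
                #   g' = g - max(0, (k . g - max_KL_divergence) / ||k||^2) * k
                # the negations and batch_size factors convert between the minimized (mean) loss gradient and the
                # maximized per-sample policy gradient on which the projection is defined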
                @tf.custom_gradient
                def trust_region_layer(x):
                    def grad(g):
                        g = - g * batch_size
                        k = - average_policy / (self.policy_probs + eps)
                        adj = tf.nn.relu(
                            (tf.reduce_sum(k * g, axis=1) - self.ap.algorithm.max_KL_divergence)
                            / (tf.reduce_sum(tf.square(k), axis=1) + eps))
                        g = g - tf.expand_dims(adj, 1) * k
                        return - g / batch_size
                    return tf.identity(x), grad
                self.output = trust_region_layer(self.output)

    def _build_discrete_net(self, input_layer, action_space):
        self.num_actions = len(action_space.actions)
        self.actions = tf.placeholder(tf.int32, [None], name='{}_actions'.format(self.get_name()))
        self.input.append(self.actions)

        policy_values = self.dense_layer(self.num_actions)(input_layer, name='fc')
        self.policy_probs = tf.nn.softmax(policy_values, name='{}_policy'.format(self.get_name()))

        # (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
        self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
        self.output = self.policy_probs
@@ -329,12 +329,12 @@ class ActionInfo(object):
    Action info is a class that holds an action and various additional information details about it
    """

-   def __init__(self, action: ActionType, action_probability: float=0,
+   def __init__(self, action: ActionType, all_action_probabilities: float=0,
                 action_value: float=0., state_value: float=0., max_action_value: float=None,
                 action_intrinsic_reward: float=0):
        """
        :param action: the action
-       :param action_probability: the probability that the action was given when selecting it
+       :param all_action_probabilities: the probabilities of all the actions, as given when the action was selected
        :param action_value: the state-action value (Q value) of the action
        :param state_value: the state value (V value) of the state where the action was taken
        :param max_action_value: in case this is an action that was selected randomly, this is the value of the action
@@ -344,7 +344,7 @@ class ActionInfo(object):
                                 selection
        """
        self.action = action
-       self.action_probability = action_probability
+       self.all_action_probabilities = all_action_probabilities
        self.action_value = action_value
        self.state_value = state_value
        if not max_action_value:

@@ -75,18 +75,27 @@ class EpisodicExperienceReplay(Memory):
    def num_transitions_in_complete_episodes(self):
        return self._num_transitions_in_complete_episodes

-   def sample(self, size: int) -> List[Transition]:
+   def sample(self, size: int, is_consecutive_transitions=False) -> List[Transition]:
        """
-       Sample a batch of transitions form the replay buffer. If the requested size is larger than the number
+       Sample a batch of transitions from the replay buffer. If the requested size is larger than the number
        of samples available in the replay buffer then the batch will return empty.
        :param size: the size of the batch to sample
+       :param is_consecutive_transitions: if set True, samples a batch of consecutive transitions.
        :return: a batch (list) of selected transitions from the replay buffer
        """
        self.reader_writer_lock.lock_writing()

        if self.num_complete_episodes() >= 1:
-           transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=size)
-           batch = [self.transitions[i] for i in transitions_idx]
+           if is_consecutive_transitions:
+               episode_idx = np.random.randint(0, self.num_complete_episodes())
+               if self._buffer[episode_idx].length() <= size:
+                   batch = self._buffer[episode_idx].transitions
+               else:
+                   transition_idx = np.random.randint(size, self._buffer[episode_idx].length())
+                   batch = self._buffer[episode_idx].transitions[transition_idx-size:transition_idx]
+           else:
+               transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=size)
+               batch = [self.transitions[i] for i in transitions_idx]

        else:
            raise ValueError("The episodic replay buffer cannot be sampled since there are no complete episodes yet. "
rl_coach/presets/Atari_ACER.py (new file, 45 lines)
@@ -0,0 +1,45 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ACERAgentParameters()

agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 10000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 50000)
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.beta_entropy = 0.05

###############
# Environment #
###############
env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
rl_coach/presets/CartPole_ACER.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ACERAgentParameters()

agent_params.algorithm.num_steps_between_gradient_updates = 5
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 1000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 50000)
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/200.))
agent_params.algorithm.beta_entropy = 0.0

###############
# Environment #
###############
env_params = GymVectorEnvironment(level='CartPole-v0')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 300
preset_validation_params.num_workers = 1

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
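For reference, a preset such as CartPole_ACER is driven through the graph_manager it defines. A minimal sketch of running it programmatically (not part of this commit; it assumes the usual rl_coach TaskParameters / GraphManager API, and the experiment path below is only a placeholder):

    from rl_coach.base_parameters import TaskParameters
    from rl_coach.presets.CartPole_ACER import graph_manager

    # build the graph for a single worker and start the improve/evaluate schedule defined above
    graph_manager.create_graph(TaskParameters(experiment_path='./experiments/cartpole_acer'))
    graph_manager.improve()

The same preset can also be launched from the command line with: coach -p CartPole_ACER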
rl_coach/presets/Doom_Basic_ACER.py (new file, 55 lines)
@@ -0,0 +1,55 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.doom_environment import DoomEnvironmentParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)


#########
# Agent #
#########
agent_params = ACERAgentParameters()

agent_params.algorithm.num_steps_between_gradient_updates = 30
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 2000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 100000)
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/100.))
agent_params.algorithm.beta_entropy = 0.01
agent_params.network_wrappers['main'].clip_gradients = 40.

###############
# Environment #
###############
env_params = DoomEnvironmentParameters(level='basic')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 20
preset_validation_params.max_episodes_to_achieve_reward = 400
preset_validation_params.num_workers = 8


graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
@@ -403,7 +403,8 @@ class DiscreteActionSpace(ActionSpace):
        return np.random.choice(self.actions)

    def sample_with_info(self) -> ActionInfo:
-       return ActionInfo(self.sample(), action_probability=1. / (self.high[0] - self.low[0] + 1))
+       return ActionInfo(self.sample(),
+                         all_action_probabilities=np.full(len(self.actions), 1. / (self.high[0] - self.low[0] + 1)))

    def get_description(self, action: int) -> str:
        if type(self.descriptions) == list and 0 <= action < len(self.descriptions):
@@ -450,7 +451,7 @@ class MultiSelectActionSpace(ActionSpace):
        return random.choice(self.actions)

    def sample_with_info(self) -> ActionInfo:
-       return ActionInfo(self.sample(), action_probability=1. / len(self.actions))
+       return ActionInfo(self.sample(), all_action_probabilities=np.full(len(self.actions), 1. / len(self.actions)))

    def get_description(self, action: np.ndarray) -> str:
        if np.sum(len(np.where(action == 0)[0])) + np.sum(len(np.where(action == 1)[0])) != self.shape or \
@@ -14,7 +14,7 @@ def test_discrete():
    for i in range(100):
        assert 3 > action_space.sample() >= 0
        action_info = action_space.sample_with_info()
-       assert action_info.action_probability == 1. / 3
+       assert action_info.all_action_probabilities[0] == 1. / 3
    assert action_space.high == 2
    assert action_space.low == 0