
pre-release 0.10.0

Gal Novik
2018-08-13 17:11:34 +03:00
parent d44c329bb8
commit 19ca5c24b1
485 changed files with 33292 additions and 16770 deletions

rl_coach/__init__.py Normal file

@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

rl_coach/agents/actor_critic_agent.py Normal file

@@ -0,0 +1,165 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
import scipy.signal
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.core_types import QActionStateValue
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import last_sample
from rl_coach.logger import screen
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class ActorCriticAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.A_VALUE
self.apply_gradients_every_x_episodes = 5
self.beta_entropy = 0
self.num_steps_between_gradient_updates = 5000 # this is called t_max in all the papers
self.gae_lambda = 0.96
self.estimate_state_value_using_gae = False
class ActorCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [VHeadParameters(), PolicyHeadParameters()]
self.loss_weights = [0.5, 1.0]
self.rescale_gradient_from_head_by_factor = [1, 1]
self.optimizer_type = 'Adam'
self.clip_gradients = 40.0
self.async_training = True
class ActorCriticAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=ActorCriticAlgorithmParameters(),
exploration=None, #TODO this should be different for continuous (ContinuousEntropyExploration)
# and discrete (CategoricalExploration) action spaces.
memory=SingleEpisodeBufferParameters(),
networks={"main": ActorCriticNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.actor_critic_agent:ActorCriticAgent'
# Actor Critic - https://arxiv.org/abs/1602.01783
class ActorCriticAgent(PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.last_gradient_update_step_idx = 0
self.action_advantages = self.register_signal('Advantages')
self.state_values = self.register_signal('Values')
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
# Discounting function used to calculate discounted returns.
def discount(self, x, gamma):
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
def get_general_advantage_estimation_values(self, rewards, values):
        # values contains n+1 elements (V_t ... V_{t+n}); rewards contains n elements (r_t ... r_{t+n-1})
bootstrap_extended_rewards = np.array(rewards.tolist() + [values[-1]])
        # Approximation-based calculation of GAE (mathematically correct only when Tmax = inf,
        # although in practice it works well even with much smaller Tmax values, e.g. 20)
deltas = rewards + self.ap.algorithm.discount * values[1:] - values[:-1]
gae = self.discount(deltas, self.ap.algorithm.discount * self.ap.algorithm.gae_lambda)
if self.ap.algorithm.estimate_state_value_using_gae:
discounted_returns = np.expand_dims(gae + values[:-1], -1)
else:
discounted_returns = np.expand_dims(np.array(self.discount(bootstrap_extended_rewards,
self.ap.algorithm.discount)), 1)[:-1]
return gae, discounted_returns
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the values for the current states
result = self.networks['main'].online_network.predict(batch.states(network_keys))
current_state_values = result[0]
self.state_values.add_sample(current_state_values)
# the targets for the state value estimator
num_transitions = batch.size
state_value_head_targets = np.zeros((num_transitions, 1))
# estimate the advantage function
action_advantages = np.zeros((num_transitions, 1))
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if batch.game_overs()[-1]:
R = 0
else:
R = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
for i in reversed(range(num_transitions)):
R = batch.rewards()[i] + self.ap.algorithm.discount * R
state_value_head_targets[i] = R
action_advantages[i] = R - current_state_values[i]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
bootstrapped_value = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
values = np.append(current_state_values, bootstrapped_value)
if batch.game_overs()[-1]:
values[-1] = 0
# get general discounted returns table
gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(batch.rewards(), values)
action_advantages = np.vstack(gae_values)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
action_advantages = action_advantages.squeeze(axis=-1)
actions = batch.actions()
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) < 2:
actions = np.expand_dims(actions, -1)
# train
result = self.networks['main'].online_network.accumulate_gradients({**batch.states(network_keys),
'output_1_0': actions},
[state_value_head_targets, action_advantages])
# logging
total_loss, losses, unclipped_grads = result[:3]
self.action_advantages.add_sample(action_advantages)
self.unclipped_grads.add_sample(unclipped_grads)
self.value_loss.add_sample(losses[0])
self.policy_loss.add_sample(losses[1])
return total_loss, losses, unclipped_grads
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "main")
return self.networks['main'].online_network.predict(tf_input_state)[1:] # index 0 is the state value
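
# --- Editor's illustration (not part of the original commit) ---
# The discount()/GAE code above uses scipy.signal.lfilter to evaluate the
# backward recursion y[t] = x[t] + gamma * y[t+1] in a vectorized way. The
# reference helper below spells out the same Generalized Advantage Estimation
# recursion with an explicit loop; it is only a sketch for checking the
# filter-based version and is not used anywhere in Coach.
def _reference_gae(rewards, values, discount, gae_lambda):
    """Explicit-loop GAE, equivalent to discount(deltas, discount * gae_lambda) above.

    :param rewards: array of n rewards
    :param values: array of n + 1 state values (the last entry is the bootstrap value)
    """
    deltas = rewards + discount * values[1:] - values[:-1]
    gae = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running
        gae[t] = running
    return gae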

rl_coach/agents/agent.py Normal file

@@ -0,0 +1,791 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import random
from collections import OrderedDict
from typing import Dict, List, Union, Tuple
import numpy as np
from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
from rl_coach.core_types import RunPhase, PredictionType, EnvironmentEpisodes, ActionType, Batch, Episode, StateType
from rl_coach.core_types import Transition, ActionInfo, TrainingSteps, EnvironmentSteps, EnvResponse
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay
from pandas import read_pickle
from six.moves import range
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace, GoalsSpace, AttentionActionSpace
from rl_coach.utils import Signal, force_list, set_cpu
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
from rl_coach.architectures.network_wrapper import NetworkWrapper
from rl_coach.logger import screen, Logger, EpisodeLogger
class Agent(AgentInterface):
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
"""
        :param agent_parameters: An AgentParameters instance with all of the agent's run parameters
"""
super().__init__()
self.ap = agent_parameters
self.task_id = self.ap.task_parameters.task_index
self.is_chief = self.task_id == 0
self.shared_memory = type(agent_parameters.task_parameters) == DistributedTaskParameters \
and self.ap.memory.shared_memory
if self.shared_memory:
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
self.name = agent_parameters.name
self.parent = parent
self.parent_level_manager = None
self.full_name_id = agent_parameters.full_name_id = self.name
if type(agent_parameters.task_parameters) == DistributedTaskParameters:
screen.log_title("Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
"tensorflow wake up time)".format(self.full_name_id, self.task_id))
else:
screen.log_title("Creating agent - name: {}".format(self.full_name_id))
self.imitation = False
self.agent_logger = Logger()
self.agent_episode_logger = EpisodeLogger()
# get the memory
# - distributed training + shared memory:
# * is chief? -> create the memory and add it to the scratchpad
# * not chief? -> wait for the chief to create the memory and then fetch it
# - non distributed training / not shared memory:
# * create memory
memory_name = self.ap.memory.path.split(':')[1]
self.memory_lookup_name = self.full_name_id + '.' + memory_name
if self.shared_memory and not self.is_chief:
self.memory = self.shared_memory_scratchpad.get(self.memory_lookup_name)
else:
# modules
if agent_parameters.memory.load_memory_from_file_path:
screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
.format(agent_parameters.memory.load_memory_from_file_path))
self.memory = read_pickle(agent_parameters.memory.load_memory_from_file_path)
else:
self.memory = dynamic_import_and_instantiate_module_from_params(self.ap.memory)
if self.shared_memory and self.is_chief:
self.shared_memory_scratchpad.add(self.memory_lookup_name, self.memory)
# set devices
if type(agent_parameters.task_parameters) == DistributedTaskParameters:
self.has_global = True
self.replicated_device = agent_parameters.task_parameters.device
self.worker_device = "/job:worker/task:{}".format(self.task_id)
else:
self.has_global = False
self.replicated_device = None
self.worker_device = ""
if agent_parameters.task_parameters.use_cpu:
self.worker_device += "/cpu:0"
else:
self.worker_device += "/device:GPU:0"
# filters
self.input_filter = self.ap.input_filter
self.output_filter = self.ap.output_filter
self.pre_network_filter = self.ap.pre_network_filter
device = self.replicated_device if self.replicated_device else self.worker_device
self.input_filter.set_device(device)
self.output_filter.set_device(device)
self.pre_network_filter.set_device(device)
# initialize all internal variables
self._phase = RunPhase.HEATUP
self.total_shaped_reward_in_current_episode = 0
self.total_reward_in_current_episode = 0
self.total_steps_counter = 0
self.running_reward = None
self.training_iteration = 0
self.last_target_network_update_step = 0
self.last_training_phase_step = 0
self.current_episode = self.ap.current_episode = 0
self.curr_state = {}
self.current_hrl_goal = None
self.current_episode_steps_counter = 0
self.episode_running_info = {}
self.last_episode_evaluation_ran = 0
self.running_observations = []
self.agent_logger.set_current_time(self.current_episode)
self.exploration_policy = None
self.networks = {}
self.last_action_info = None
self.running_observation_stats = None
self.running_reward_stats = None
self.accumulated_rewards_across_evaluation_episodes = 0
self.accumulated_shaped_rewards_across_evaluation_episodes = 0
self.num_successes_across_evaluation_episodes = 0
self.num_evaluation_episodes_completed = 0
self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
        # TODO: add the agent's observation rendering for debugging purposes (not the same as the environment rendering)
# environment parameters
self.spaces = None
self.in_action_space = self.ap.algorithm.in_action_space
# signals
self.episode_signals = []
self.step_signals = []
self.loss = self.register_signal('Loss')
self.curr_learning_rate = self.register_signal('Learning Rate')
self.unclipped_grads = self.register_signal('Grads (unclipped)')
self.reward = self.register_signal('Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
self.shaped_reward = self.register_signal('Shaped Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
if isinstance(self.in_action_space, GoalsSpace):
self.distance_from_goal = self.register_signal('Distance From Goal', dump_one_value_per_step=True)
# use seed
if self.ap.task_parameters.seed is not None:
random.seed(self.ap.task_parameters.seed)
np.random.seed(self.ap.task_parameters.seed)
@property
def parent(self):
"""
Get the parent class of the agent
        :return: the parent of the agent
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent class of the agent.
Additionally, updates the full name of the agent
:param val: the new parent
:return: None
"""
self._parent = val
if self._parent is not None:
if not hasattr(self._parent, 'name'):
raise ValueError("The parent of an agent must have a name")
self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)
def setup_logger(self):
# dump documentation
logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
format(graph_name=self.parent_level_manager.parent_graph_manager.name,
level_name=self.parent_level_manager.name,
agent_full_id='.'.join(self.full_name_id.split('/')))
self.agent_logger.set_logger_filenames(self.ap.task_parameters.experiment_path, logger_prefix=logger_prefix,
add_timestamp=True, task_id=self.task_id)
if self.ap.visualization.dump_in_episode_signals:
self.agent_episode_logger.set_logger_filenames(self.ap.task_parameters.experiment_path,
logger_prefix=logger_prefix,
add_timestamp=True, task_id=self.task_id)
def set_session(self, sess) -> None:
"""
        Set the deep learning framework session for all of the agent's networks and filters
:return: None
"""
self.input_filter.set_session(sess)
self.output_filter.set_session(sess)
self.pre_network_filter.set_session(sess)
[network.set_session(sess) for network in self.networks.values()]
def register_signal(self, signal_name: str, dump_one_value_per_episode: bool=True,
dump_one_value_per_step: bool=False) -> Signal:
"""
Register a signal such that its statistics will be dumped and be viewable through dashboard
:param signal_name: the name of the signal as it will appear in dashboard
:param dump_one_value_per_episode: should the signal value be written for each episode?
:param dump_one_value_per_step: should the signal value be written for each step?
:return: the created signal
"""
signal = Signal(signal_name)
if dump_one_value_per_episode:
self.episode_signals.append(signal)
if dump_one_value_per_step:
self.step_signals.append(signal)
return signal
def set_environment_parameters(self, spaces: SpacesDefinition):
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules
:param spaces: the environment spaces definition
:return: None
"""
self.spaces = copy.deepcopy(spaces)
if self.ap.algorithm.use_accumulated_reward_as_measurement:
if 'measurements' in self.spaces.state.sub_spaces:
self.spaces.state['measurements'].shape += 1
self.spaces.state['measurements'].measurements_names += ['accumulated_reward']
else:
self.spaces.state['measurements'] = VectorObservationSpace(1, measurements_names=['accumulated_reward'])
for observation_name in self.spaces.state.sub_spaces.keys():
self.spaces.state[observation_name] = \
self.pre_network_filter.get_filtered_observation_space(observation_name,
self.input_filter.get_filtered_observation_space(observation_name,
self.spaces.state[observation_name]))
self.spaces.reward = self.pre_network_filter.get_filtered_reward_space(
self.input_filter.get_filtered_reward_space(self.spaces.reward))
self.spaces.action = self.output_filter.get_unfiltered_action_space(self.spaces.action)
if isinstance(self.in_action_space, GoalsSpace):
# TODO: what if the goal type is an embedding / embedding change?
self.spaces.goal = self.in_action_space
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
self.init_environment_dependent_modules()
def create_networks(self) -> Dict[str, NetworkWrapper]:
"""
Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.
        :return: A dictionary mapping network names to the created network wrappers
"""
networks = {}
for network_name in sorted(self.ap.network_wrappers.keys()):
networks[network_name] = NetworkWrapper(name=network_name,
agent_parameters=self.ap,
has_target=self.ap.network_wrappers[network_name].create_target_network,
has_global=self.has_global,
spaces=self.spaces,
replicated_device=self.replicated_device,
worker_device=self.worker_device)
return networks
def init_environment_dependent_modules(self) -> None:
"""
Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space
:return: None
"""
# initialize exploration policy
self.ap.exploration.action_space = self.spaces.action
self.exploration_policy = dynamic_import_and_instantiate_module_from_params(self.ap.exploration)
# create all the networks of the agent
self.networks = self.create_networks()
@property
def phase(self) -> RunPhase:
return self._phase
@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the phase of the run for the agent and all the sub components
        :param val: the new run phase (TRAIN, TEST, etc.)
:return: None
"""
self.reset_evaluation_state(val)
self._phase = val
self.exploration_policy.change_phase(val)
def reset_evaluation_state(self, val: RunPhase) -> None:
starting_evaluation = (val == RunPhase.TEST)
ending_evaluation = (self.phase == RunPhase.TEST)
if starting_evaluation:
self.accumulated_rewards_across_evaluation_episodes = 0
self.accumulated_shaped_rewards_across_evaluation_episodes = 0
self.num_successes_across_evaluation_episodes = 0
self.num_evaluation_episodes_completed = 0
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
screen.log_title("{}: Starting evaluation phase".format(self.name))
elif ending_evaluation:
            # we write to the next episode, because the current episode may already have been written to disk,
            # in which case it would not be written again
self.agent_logger.set_current_time(self.current_episode + 1)
self.agent_logger.create_signal_value(
'Evaluation Reward',
self.accumulated_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
self.agent_logger.create_signal_value(
'Shaped Evaluation Reward',
self.accumulated_shaped_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
self.agent_logger.create_signal_value(
"Success Rate",
success_rate
)
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
screen.log_title("{}: Finished evaluation phase. Success rate = {}"
.format(self.name, np.round(success_rate, 2)))
def call_memory(self, func, args=()):
"""
This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.
:param func: the name of the memory function to call
:param args: the arguments to supply to the function
:return: the return value of the function
"""
if self.shared_memory:
result = self.shared_memory_scratchpad.internal_call(self.memory_lookup_name, func, args)
else:
if type(args) != tuple:
args = (args,)
result = getattr(self.memory, func)(*args)
return result
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
log["Exploration"] = np.round(self.exploration_policy.get_control_param(), 2)
log["Steps"] = self.total_steps_counter
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)
def update_step_in_episode_log(self):
"""
Writes logging messages to screen and updates the log file with all the signal values.
:return: None
"""
# log all the signals to file
self.agent_episode_logger.set_current_time(self.current_episode_steps_counter)
self.agent_episode_logger.create_signal_value('Training Iter', self.training_iteration)
self.agent_episode_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
self.agent_episode_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
self.agent_episode_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
self.agent_episode_logger.create_signal_value('Total steps', self.total_steps_counter)
self.agent_episode_logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
self.agent_episode_logger.create_signal_value("Shaped Accumulated Reward", self.total_shaped_reward_in_current_episode)
self.agent_episode_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.agent_episode_logger.update_wall_clock_time(self.current_episode_steps_counter)
for signal in self.step_signals:
self.agent_episode_logger.create_signal_value(signal.name, signal.get_last_value())
# dump
self.agent_episode_logger.dump_output_csv()
def update_log(self):
"""
Writes logging messages to screen and updates the log file with all the signal values.
:return: None
"""
# log all the signals to file
self.agent_logger.set_current_time(self.current_episode)
self.agent_logger.create_signal_value('Training Iter', self.training_iteration)
self.agent_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
self.agent_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
self.agent_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
self.agent_logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
self.agent_logger.create_signal_value('Total steps', self.total_steps_counter)
self.agent_logger.create_signal_value("Epsilon", np.mean(self.exploration_policy.get_control_param()))
self.agent_logger.create_signal_value("Shaped Training Reward", self.total_shaped_reward_in_current_episode
if self._phase == RunPhase.TRAIN else np.nan)
self.agent_logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
if self._phase == RunPhase.TRAIN else np.nan)
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.agent_logger.update_wall_clock_time(self.current_episode)
if self._phase != RunPhase.TEST:
self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
for signal in self.episode_signals:
self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
self.agent_logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
self.agent_logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
# dump
if self.current_episode % self.ap.visualization.dump_signals_to_csv_every_x_episodes == 0 \
and self.current_episode > 0:
self.agent_logger.dump_output_csv()
def handle_episode_ended(self) -> None:
"""
End an episode
:return: None
"""
self.current_episode_buffer.is_complete = True
if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
self.current_episode += 1
if self.phase != RunPhase.TEST and isinstance(self.memory, EpisodicExperienceReplay):
self.call_memory('store_episode', self.current_episode_buffer)
if self.phase == RunPhase.TEST:
self.accumulated_rewards_across_evaluation_episodes += self.total_reward_in_current_episode
self.accumulated_shaped_rewards_across_evaluation_episodes += self.total_shaped_reward_in_current_episode
self.num_evaluation_episodes_completed += 1
if self.spaces.reward.reward_success_threshold and \
self.total_reward_in_current_episode >= self.spaces.reward.reward_success_threshold:
self.num_successes_across_evaluation_episodes += 1
if self.ap.visualization.dump_csv:
self.update_log()
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
self.log_to_screen()
def reset_internal_state(self):
"""
Reset all the episodic parameters
:return: None
"""
for signal in self.episode_signals:
signal.reset()
for signal in self.step_signals:
signal.reset()
self.agent_episode_logger.set_episode_idx(self.current_episode)
self.total_shaped_reward_in_current_episode = 0
self.total_reward_in_current_episode = 0
self.curr_state = {}
self.current_episode_steps_counter = 0
self.episode_running_info = {}
self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
if self.exploration_policy:
self.exploration_policy.reset()
self.input_filter.reset()
self.output_filter.reset()
self.pre_network_filter.reset()
if isinstance(self.memory, EpisodicExperienceReplay):
self.call_memory('verify_last_episode_is_closed')
for network in self.networks.values():
network.online_network.reset_internal_memory()
def learn_from_batch(self, batch) -> Tuple[float, List, List]:
"""
Given a batch of transitions, calculates their target values and updates the network.
:param batch: A list of transitions
:return: The total loss of the training, the loss per head and the unclipped gradients
"""
return 0, [], []
def _should_update_online_weights_to_target(self):
"""
Determine if online weights should be copied to the target.
:return: boolean: True if the online weights should be copied to the target.
"""
# update the target network of every network that has a target network
step_method = self.ap.algorithm.num_steps_between_copying_online_weights_to_target
if step_method.__class__ == TrainingSteps:
should_update = (self.training_iteration - self.last_target_network_update_step) >= step_method.num_steps
if should_update:
self.last_target_network_update_step = self.training_iteration
elif step_method.__class__ == EnvironmentSteps:
should_update = (self.total_steps_counter - self.last_target_network_update_step) >= step_method.num_steps
if should_update:
self.last_target_network_update_step = self.total_steps_counter
else:
raise ValueError("The num_steps_between_copying_online_weights_to_target parameter should be either "
"EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
return should_update
def _should_train(self, wait_for_full_episode=False):
"""
Determine if we should start a training phase according to the number of steps passed since the last training
:return: boolean: True if we should start a training phase
"""
step_method = self.ap.algorithm.num_consecutive_playing_steps
if step_method.__class__ == EnvironmentEpisodes:
should_update = (self.current_episode - self.last_training_phase_step) >= step_method.num_steps
if should_update:
self.last_training_phase_step = self.current_episode
elif step_method.__class__ == EnvironmentSteps:
should_update = (self.total_steps_counter - self.last_training_phase_step) >= step_method.num_steps
if wait_for_full_episode:
should_update = should_update and self.current_episode_steps_counter == 0
if should_update:
self.last_training_phase_step = self.total_steps_counter
else:
raise ValueError("The num_consecutive_playing_steps parameter should be either "
"EnvironmentSteps or Episodes. Instead it is {}".format(step_method.__class__))
return should_update
def train(self):
"""
Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.
:return: The total training loss during the training iterations.
"""
loss = 0
if self._should_train():
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
# TODO: this should be network dependent
network_parameters = list(self.ap.network_wrappers.values())[0]
# update counters
self.training_iteration += 1
# sample a batch and train on it
batch = self.call_memory('sample', network_parameters.batch_size)
if self.pre_network_filter is not None:
batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)
# if the batch returned empty then there are not enough samples in the replay buffer -> skip
# training step
if len(batch) > 0:
# train
batch = Batch(batch)
total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
loss += total_loss
self.unclipped_grads.add_sample(unclipped_grads)
# TODO: the learning rate decay should be done through the network instead of here
# decay learning rate
if network_parameters.learning_rate_decay_rate != 0:
self.curr_learning_rate.add_sample(self.networks['main'].sess.run(
self.networks['main'].online_network.current_learning_rate))
else:
self.curr_learning_rate.add_sample(network_parameters.learning_rate)
if any([network.has_target for network in self.networks.values()]) \
and self._should_update_online_weights_to_target():
for network in self.networks.values():
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
self.agent_logger.create_signal_value('Update Target Network', 1)
else:
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.loss.add_sample(loss)
if self.imitation:
self.log_to_screen()
# run additional commands after the training is done
self.post_training_commands()
return loss
def choose_action(self, curr_state):
"""
        Choose an action to act with in the current episode being played. Different behavior may be exhibited when
        training or testing.
:param curr_state: the current state to act upon.
:return: chosen action, some action value describing the action (q-value, probability, etc)
"""
pass
def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
network_name: str):
"""
        Convert curr_state into the input tensors that TensorFlow expects, i.e. if we have several input states,
        stack all of the observations together, all of the measurements together, etc.
"""
# convert to batch so we can run it through the network
states = force_list(states)
batches_dict = {}
for key in self.ap.network_wrappers[network_name].input_embedders_parameters.keys():
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
# addition to the current_state, so that all the inputs of the network will be filled)
if key in states[0].keys():
batches_dict[key] = np.array([np.array(state[key]) for state in states])
return batches_dict
def act(self) -> ActionInfo:
"""
        Given the agent's current knowledge, decide on the next action to apply to the environment
:return: an action and a dictionary containing any additional info from the action decision process
"""
if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
# This agent never plays while training (e.g. behavioral cloning)
return None
# count steps (only when training or if we are in the evaluation worker)
if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
self.total_steps_counter += 1
self.current_episode_steps_counter += 1
# decide on the action
if self.phase == RunPhase.HEATUP and not self.ap.algorithm.heatup_using_network_decisions:
# random action
self.last_action_info = self.spaces.action.sample_with_info()
else:
# informed action
if self.pre_network_filter is not None:
# before choosing an action, first use the pre_network_filter to filter out the current state
curr_state = self.run_pre_network_filter_for_inference(self.curr_state)
else:
curr_state = self.curr_state
self.last_action_info = self.choose_action(curr_state)
filtered_action_info = self.output_filter.filter(self.last_action_info)
return filtered_action_info
def run_pre_network_filter_for_inference(self, state: StateType):
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response)[0].next_state
def get_state_embedding(self, state: dict) -> np.ndarray:
"""
Given a state, get the corresponding state embedding from the main network
:param state: a state dict
:return: a numpy embedding vector
"""
# TODO: this won't work anymore
# TODO: instead of the state embedding (which contains the goal) we should use the observation embedding
embedding = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(state, "main"),
outputs=self.networks['main'].online_network.state_embedding)
return embedding
def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
"""
Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.
:param transition: the transition to update
:return: the updated transition
"""
return transition
def observe(self, env_response: EnvResponse) -> bool:
"""
Given a response from the environment, distill the observation from it and store it for later use.
        The response is an EnvResponse object containing the new observation and measurements, the reward, a game
        over flag and any additional information necessary.
:param env_response: result of call from environment.step(action)
:return:
"""
# filter the env_response
filtered_env_response = self.input_filter.filter(env_response)[0]
# inject agent collected statistics, if required
if self.ap.algorithm.use_accumulated_reward_as_measurement:
if 'measurements' in filtered_env_response.next_state:
filtered_env_response.next_state['measurements'] = np.append(filtered_env_response.next_state['measurements'],
self.total_shaped_reward_in_current_episode)
else:
filtered_env_response.next_state['measurements'] = np.array([self.total_shaped_reward_in_current_episode])
        # if we are in the first step of the episode, then we don't have a next state or a reward yet, and thus no
        # transition to store in the memory.
        # also, we have not reached the goal yet.
if self.current_episode_steps_counter == 0:
# initialize the current state
self.curr_state = filtered_env_response.next_state
return env_response.game_over
else:
transition = Transition(state=copy.copy(self.curr_state), action=self.last_action_info.action,
reward=filtered_env_response.reward, next_state=filtered_env_response.next_state,
game_over=filtered_env_response.game_over, info=filtered_env_response.info)
# now that we have formed a basic transition - the next state progresses to be the current state
self.curr_state = filtered_env_response.next_state
# make agent specific changes to the transition if needed
transition = self.update_transition_before_adding_to_replay_buffer(transition)
# merge the intrinsic reward in
if self.ap.algorithm.scale_external_reward_by_intrinsic_reward_value:
transition.reward = transition.reward * (1 + self.last_action_info.action_intrinsic_reward)
else:
transition.reward = transition.reward + self.last_action_info.action_intrinsic_reward
# sum up the total shaped reward
self.total_shaped_reward_in_current_episode += transition.reward
self.total_reward_in_current_episode += env_response.reward
self.shaped_reward.add_sample(transition.reward)
self.reward.add_sample(env_response.reward)
# add action info to transition
if type(self.parent).__name__ == 'CompositeAgent':
transition.add_info(self.parent.last_action_info.__dict__)
else:
transition.add_info(self.last_action_info.__dict__)
# create and store the transition
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
# for episodic memories we keep the transitions in a local buffer until the episode is ended.
# for regular memories we insert the transitions directly to the memory
if isinstance(self.memory, EpisodicExperienceReplay):
self.current_episode_buffer.insert(transition)
else:
self.call_memory('store', transition)
if self.ap.visualization.dump_in_episode_signals:
self.update_step_in_episode_log()
return transition.game_over
def post_training_commands(self):
pass
def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
"""
Get a prediction from the agent with regard to the requested prediction_type.
        If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
        raise a ValueError.
:param states:
:param prediction_type:
:return:
"""
predictions = self.networks['main'].online_network.predict_with_prediction_type(
# states=self.dict_state_to_batches_dict(states, 'main'), prediction_type=prediction_type)
states=states, prediction_type=prediction_type)
if len(predictions.keys()) != 1:
raise ValueError("The network has more than one component {} matching the requested prediction_type {}. ".
format(list(predictions.keys()), prediction_type))
return list(predictions.values())[0]
def set_incoming_directive(self, action: ActionType) -> None:
if isinstance(self.in_action_space, GoalsSpace):
self.current_hrl_goal = action
elif isinstance(self.in_action_space, AttentionActionSpace):
self.input_filter.observation_filters['attention'].crop_low = action[0]
self.input_filter.observation_filters['attention'].crop_high = action[1]
self.output_filter.action_filters['masking'].set_masking(action[0], action[1])
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Allows agents to store additional information when saving checkpoints.
:param checkpoint_id: the id of the checkpoint
:return: None
"""
pass
def sync(self) -> None:
"""
Sync the global network parameters to local networks
:return: None
"""
for network in self.networks.values():
network.sync()
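
# --- Editor's illustration (not part of the original commit) ---
# prepare_batch_for_inference() above stacks a list of per-step state dicts into
# one numpy array per observation key, which is the layout the network inputs
# expect. The stripped-down sketch below shows just that stacking step, without
# the network-wrapper specific key filtering.
def _reference_stack_states(states):
    """states: a list of dicts mapping observation name -> np.ndarray."""
    return {key: np.array([np.array(state[key]) for state in states])
            for key in states[0].keys()}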

rl_coach/agents/agent_interface.py Normal file

@@ -0,0 +1,125 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, List, Dict
import numpy as np
from rl_coach.core_types import EnvResponse, ActionInfo, RunPhase, PredictionType, ActionType
class AgentInterface(object):
def __init__(self):
self._phase = RunPhase.HEATUP
self._parent = None
self.spaces = None
@property
def parent(self):
"""
Get the parent class of the agent
        :return: the parent of the agent
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent class of the agent
:param val: the new parent
:return: None
"""
self._parent = val
@property
def phase(self) -> RunPhase:
"""
Get the phase of the agent
:return: the current phase
"""
return self._phase
@phase.setter
def phase(self, val: RunPhase):
"""
Change the phase of the agent
:param val: the new phase
:return: None
"""
self._phase = val
def reset_internal_state(self) -> None:
"""
Reset the episode parameters for the agent
:return: None
"""
raise NotImplementedError("")
def train(self) -> Union[float, List]:
"""
        Train the agent's network
:return: The loss of the training
"""
raise NotImplementedError("")
def act(self) -> ActionInfo:
"""
Get a decision of the next action to take.
        The action depends on the current state, which the agent holds either from resetting the environment or from
        the observe function.
:return: A tuple containing the actual action and additional info on the action
"""
raise NotImplementedError("")
def observe(self, env_response: EnvResponse) -> bool:
"""
Gets a response from the environment.
Processes this information for later use. For example, create a transition and store it in memory.
The action info (a class containing any info the agent wants to store regarding its action decision process) is
stored by the agent itself when deciding on the action.
        :param env_response: an EnvResponse containing the response from the environment
        :return: a done signal which is based on the agent's knowledge. This can be different from the done signal from
the environment. For example, an agent can decide to finish the episode each time it gets some
intrinsic reward
"""
raise NotImplementedError("")
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Save the model of the agent to the disk. This can contain the network parameters, the memory of the agent, etc.
:param checkpoint_id: the checkpoint id to use for saving
:return: None
"""
raise NotImplementedError("")
def get_predictions(self, states: Dict, prediction_type: PredictionType) -> np.ndarray:
"""
Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot predict this
type of prediction_type, or if there is more than possible way to do so, raise a ValueException.
:param states:
:param prediction_type:
:return: the agent's prediction
"""
raise NotImplementedError("")
def set_incoming_directive(self, action: ActionType) -> None:
"""
Pass a higher level command (directive) to the agent.
For example, a higher level agent can set the goal of the agent.
:param action: the directive to pass to the agent
:return: None
"""
raise NotImplementedError("")
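
# --- Editor's illustration (not part of the original commit) ---
# A minimal, do-nothing subclass showing which methods a concrete agent has to
# override. The constant action returned by act() is purely hypothetical and is
# only there to make the sketch self-contained; real agents derive it from the
# current state and an exploration policy.
class _NoOpAgent(AgentInterface):
    def reset_internal_state(self) -> None:
        pass

    def train(self) -> float:
        return 0.0

    def act(self) -> ActionInfo:
        return ActionInfo(action=0)  # hypothetical constant action

    def observe(self, env_response: EnvResponse) -> bool:
        return env_response.game_over

    def save_checkpoint(self, checkpoint_id: int) -> None:
        pass

    def get_predictions(self, states: Dict, prediction_type: PredictionType) -> np.ndarray:
        raise NotImplementedError("this sketch has no network to predict with")

    def set_incoming_directive(self, action: ActionType) -> None:
        pass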

rl_coach/agents/bc_agent.py Normal file

@@ -0,0 +1,81 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.imitation_agent import ImitationAgent
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.base_parameters import AgentParameters, AlgorithmParameters, NetworkParameters, InputEmbedderParameters, \
MiddlewareScheme
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class BCAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.collect_new_data = False
class BCNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
class BCAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=BCAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": BCNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.bc_agent:BCAgent'
# Behavioral Cloning Agent
class BCAgent(ImitationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
        # When using a policy head, the targets are the advantages that we normally feed the head with.
        # In this case, we need the policy head to simply predict probabilities, so while we usually train the
        # network on log(Pi) * Advantages, here we train it on log(Pi) alone, which after the softmax will
        # predict Pi (= the action probabilities)
targets = np.ones(batch.actions().shape[0])
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
'output_0_0': batch.actions()},
targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
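
# --- Editor's illustration (not part of the original commit) ---
# The policy head normally minimizes -log(pi(a|s)) * advantage. Feeding a target
# of 1 for every transition (targets = np.ones(...) above) therefore reduces the
# loss to the plain negative log-likelihood of the demonstrated actions, i.e.
# behavioral cloning. A toy numpy version of that loss, for a discrete action
# space:
def _reference_bc_loss(action_probabilities, demonstrated_actions):
    """Mean negative log-likelihood of the demonstrated actions.

    :param action_probabilities: (batch, num_actions) softmax outputs
    :param demonstrated_actions: (batch,) integer action indices
    """
    picked = action_probabilities[np.arange(len(demonstrated_actions)), demonstrated_actions]
    return -np.mean(np.log(picked))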

rl_coach/agents/bootstrapped_dqn_agent.py Normal file

@@ -0,0 +1,84 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.exploration_policies.bootstrapped import BootstrappedParameters
class BootstrappedDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.num_output_head_copies = 10
self.rescale_gradient_from_head_by_factor = [1.0/self.num_output_head_copies]*self.num_output_head_copies
class BootstrappedDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.network_wrappers = {"main": BootstrappedDQNNetworkParameters()}
self.exploration = BootstrappedParameters()
@property
def path(self):
return 'rl_coach.agents.bootstrapped_dqn_agent:BootstrappedDQNAgent'
# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
class BootstrappedDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def reset_internal_state(self):
super().reset_internal_state()
self.exploration_policy.select_head()
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
next_states_online_values = self.networks['main'].online_network.predict(batch.next_states(network_keys))
result = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
q_st_plus_1 = result[:self.ap.exploration.architecture_num_q_heads]
TD_targets = result[self.ap.exploration.architecture_num_q_heads:]
# initialize with the current prediction so that we will
# only update the action that we have actually done in this transition
for i in range(self.ap.network_wrappers['main'].batch_size):
mask = batch[i].info['mask']
for head_idx in range(self.ap.exploration.architecture_num_q_heads):
if mask[head_idx] == 1:
selected_action = np.argmax(next_states_online_values[head_idx][i], 0)
TD_targets[head_idx][i, batch.actions()[i]] = \
batch.rewards()[i] + (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount \
* q_st_plus_1[head_idx][i][selected_action]
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def observe(self, env_response):
mask = np.random.binomial(1, self.ap.exploration.bootstrapped_data_sharing_probability,
self.ap.exploration.architecture_num_q_heads)
env_response.info['mask'] = mask
return super().observe(env_response)
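
# --- Editor's illustration (not part of the original commit) ---
# Every observed transition gets a binary mask with one entry per Q head
# (see observe() above); a head only bootstraps from the transitions whose
# mask bit is 1 in learn_from_batch(). The sketch below shows what such masks
# look like; the sharing probability and head count used here are hypothetical
# defaults, the real values come from BootstrappedParameters.
def _reference_bootstrap_masks(num_transitions, num_q_heads=10, sharing_probability=0.5):
    """Sample one Bernoulli(sharing_probability) mask of length num_q_heads per transition."""
    return np.random.binomial(1, sharing_probability, size=(num_transitions, num_q_heads))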

rl_coach/agents/categorical_dqn_agent.py Normal file

@@ -0,0 +1,114 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNNetworkParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.categorical_q_head import CategoricalQHeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.core_types import StateType
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class CategoricalDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.heads_parameters = [CategoricalQHeadParameters()]
class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.v_min = -10.0
self.v_max = 10.0
self.atoms = 51
class CategoricalDQNExplorationParameters(EGreedyParameters):
def __init__(self):
super().__init__()
self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.evaluation_epsilon = 0.001
class CategoricalDQNAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=CategoricalDQNAlgorithmParameters(),
exploration=CategoricalDQNExplorationParameters(),
memory=ExperienceReplayParameters(),
networks={"main": CategoricalDQNNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.categorical_dqn_agent:CategoricalDQNAgent'
# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
class CategoricalDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.z_values = np.linspace(self.ap.algorithm.v_min, self.ap.algorithm.v_max, self.ap.algorithm.atoms)
def distribution_prediction_to_q_values(self, prediction):
return np.dot(prediction, self.z_values)
# prediction's format is (batch,actions,atoms)
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
prediction = self.get_prediction(states)
q_values = self.distribution_prediction_to_q_values(prediction)
else:
q_values = None
return q_values
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# for the action we actually took, the error is calculated by the atoms distribution
# for all other actions, the error is 0
distributed_q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# only update the action that we have actually done in this transition
target_actions = np.argmax(self.distribution_prediction_to_q_values(distributed_q_st_plus_1), axis=1)
m = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))
batches = np.arange(self.ap.network_wrappers['main'].batch_size)
for j in range(self.z_values.size):
tzj = np.fmax(np.fmin(batch.rewards() +
(1.0 - batch.game_overs()) * self.ap.algorithm.discount * self.z_values[j],
self.z_values[self.z_values.size - 1]),
self.z_values[0])
bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
u = (np.ceil(bj)).astype(int)
l = (np.floor(bj)).astype(int)
m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))
m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))
# total_loss = cross entropy between actual result above and predicted result for the given action
TD_targets[batches, batch.actions()] = m
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
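
# --- Editor's illustration (not part of the original commit) ---
# The loop in learn_from_batch() projects the shifted target support
# r + gamma * z_j onto the fixed atoms z, splitting each atom's probability
# mass between its two nearest neighbours on the support (the C51 projection).
# The helper below performs the same projection for a single transition, which
# is easier to step through than the vectorized batch version above.
def _reference_categorical_projection(reward, game_over, discount, z_values, target_distribution):
    """Project one transition's target distribution onto the z_values support."""
    m = np.zeros_like(z_values)
    for z_j, p_j in zip(z_values, target_distribution):
        tzj = np.clip(reward + (1.0 - game_over) * discount * z_j, z_values[0], z_values[-1])
        bj = (tzj - z_values[0]) / (z_values[1] - z_values[0])
        l, u = int(np.floor(bj)), int(np.ceil(bj))
        m[l] += p_j * (u - bj)
        m[u] += p_j * (bj - l)
    return m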

rl_coach/agents/clipped_ppo_agent.py Normal file

@@ -0,0 +1,277 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from collections import OrderedDict
from random import shuffle
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.core_types import EnvironmentSteps, Batch, EnvResponse, StateType
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
from rl_coach.logger import screen
class ClippedPPONetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [VHeadParameters(), PPOHeadParameters()]
self.loss_weights = [1.0, 1.0]
self.rescale_gradient_from_head_by_factor = [1, 1]
self.batch_size = 64
self.optimizer_type = 'Adam'
self.clip_gradients = None
self.use_separate_networks_per_head = True
self.async_training = False
self.l2_regularization = 0
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
class ClippedPPOAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_episodes_in_experience_replay = 1000000
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
self.gae_lambda = 0.95
self.use_kl_regularization = False
self.clip_likelihood_ratio_using_epsilon = 0.2
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.beta_entropy = 0.01 # should be 0 for mujoco
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
self.optimization_epochs = 10
self.normalization_stats = None
self.clipping_decay_schedule = ConstantSchedule(1)
class ClippedPPOAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=ClippedPPOAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": ClippedPPONetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.clipped_ppo_agent:ClippedPPOAgent'
# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
class ClippedPPOAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
# signals definition
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = self.register_signal('Grads (unclipped)')
self.value_targets = self.register_signal('Value Targets')
self.kl_divergence = self.register_signal('KL Divergence')
self.likelihood_ratio = self.register_signal('Likelihood Ratio')
self.clipped_likelihood_ratio = self.register_signal('Clipped Likelihood Ratio')
def set_session(self, sess):
super().set_session(sess)
if self.ap.algorithm.normalization_stats is not None:
self.ap.algorithm.normalization_stats.set_session(sess)
def fill_advantages(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
current_state_values = self.networks['main'].online_network.predict(batch.states(network_keys))[0]
current_state_values = current_state_values.squeeze()
self.state_values.add_sample(current_state_values)
# calculate advantages
advantages = []
value_targets = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
advantages = batch.total_returns() - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
value_targets = np.array([])
for idx, game_over in enumerate(batch.game_overs()):
if game_over:
# get advantages for the rollout
value_bootstrapping = np.zeros((1,))
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
rollout_advantages, gae_based_value_targets = \
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
rollout_state_values)
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
value_targets = np.append(value_targets, gae_based_value_targets)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
for transition, advantage, value_target in zip(batch.transitions, advantages, value_targets):
transition.info['advantage'] = advantage
transition.info['gae_based_value_target'] = value_target
self.action_advantages.add_sample(advantages)
def train_network(self, batch, epochs):
batch_results = []
for j in range(epochs):
batch.shuffle()
batch_results = {
'total_loss': [],
'losses': [],
'unclipped_grads': [],
'kl_divergence': [],
'entropy': []
}
fetches = [self.networks['main'].online_network.output_heads[1].kl_divergence,
self.networks['main'].online_network.output_heads[1].entropy,
self.networks['main'].online_network.output_heads[1].likelihood_ratio,
self.networks['main'].online_network.output_heads[1].clipped_likelihood_ratio]
for i in range(int(batch.size / self.ap.network_wrappers['main'].batch_size)):
start = i * self.ap.network_wrappers['main'].batch_size
end = (i + 1) * self.ap.network_wrappers['main'].batch_size
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
actions = batch.actions()[start:end]
gae_based_value_targets = batch.info('gae_based_value_target')[start:end]
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
# TODO-perf - the target network ("old_policy") does not change during the optimization epochs, so this
# prediction can be calculated once for all epochs; the shuffling should then be performed on indices only.
result = self.networks['main'].target_network.predict({k: v[start:end] for k, v in batch.states(network_keys).items()})
old_policy_distribution = result[1:]
# calculate gradients and apply on both the local policy network and on the global policy network
if self.ap.algorithm.estimate_state_value_using_gae:
value_targets = np.expand_dims(gae_based_value_targets, -1)
else:
value_targets = batch.total_returns(expand_dims=True)[start:end]
inputs = copy.copy({k: v[start:end] for k, v in batch.states(network_keys).items()})
inputs['output_1_0'] = actions
# The old_policy_distribution needs to be represented as a list, because for discrete controls it has just
# a mean, while otherwise it has both a mean and a standard deviation
for input_index, input in enumerate(old_policy_distribution):
inputs['output_1_{}'.format(input_index + 1)] = input
inputs['output_1_3'] = self.ap.algorithm.clipping_decay_schedule.current_value
total_loss, losses, unclipped_grads, fetch_result = \
self.networks['main'].train_and_sync_networks(
inputs, [value_targets, batch.info('advantage')[start:end]], additional_fetches=fetches
)
batch_results['total_loss'].append(total_loss)
batch_results['losses'].append(losses)
batch_results['unclipped_grads'].append(unclipped_grads)
batch_results['kl_divergence'].append(fetch_result[0])
batch_results['entropy'].append(fetch_result[1])
self.unclipped_grads.add_sample(unclipped_grads)
self.value_targets.add_sample(value_targets)
self.likelihood_ratio.add_sample(fetch_result[2])
self.clipped_likelihood_ratio.add_sample(fetch_result[3])
for key in batch_results.keys():
batch_results[key] = np.mean(batch_results[key], 0)
self.value_loss.add_sample(batch_results['losses'][0])
self.policy_loss.add_sample(batch_results['losses'][1])
if self.ap.network_wrappers['main'].learning_rate_decay_rate != 0:
curr_learning_rate = self.networks['main'].online_network.get_variable_value(
self.networks['main'].online_network.adaptive_learning_rate_scheme)
self.curr_learning_rate.add_sample(curr_learning_rate)
else:
curr_learning_rate = self.ap.network_wrappers['main'].learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
("Surrogate loss", batch_results['losses'][1]),
("KL divergence", batch_results['kl_divergence']),
("Entropy", batch_results['entropy']),
("training epoch", j),
("learning_rate", curr_learning_rate)
]),
prefix="Policy training"
)
self.total_kl_divergence_during_training_process = batch_results['kl_divergence']
self.entropy.add_sample(batch_results['entropy'])
self.kl_divergence.add_sample(batch_results['kl_divergence'])
return batch_results['losses']
def post_training_commands(self):
# clean memory
self.call_memory('clean')
def train(self):
if self._should_train(wait_for_full_episode=True):
dataset = self.memory.transitions
dataset = self.pre_network_filter.filter(dataset, deep_copy=False)
batch = Batch(dataset)
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
self.networks['main'].sync()
self.fill_advantages(batch)
# take only the requested number of steps
if isinstance(self.ap.algorithm.num_consecutive_playing_steps, EnvironmentSteps):
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
shuffle(dataset)
batch = Batch(dataset)
self.train_network(batch, self.ap.algorithm.optimization_epochs)
self.post_training_commands()
self.training_iteration += 1
# self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return None
def run_pre_network_filter_for_inference(self, state: StateType):
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response, update_internal_state=False)[0].next_state
def choose_action(self, curr_state):
self.ap.algorithm.clipping_decay_schedule.step()
return super().choose_action(curr_state)
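
As a point of reference for what the PPO head's loss is expected to compute from the value targets and advantages fed above, here is a minimal numpy sketch of the clipped surrogate objective, assuming epsilon corresponds to clip_likelihood_ratio_using_epsilon (illustrative only, not taken from the head implementation):

import numpy as np

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    ratio = np.exp(new_log_probs - old_log_probs)
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # PPO maximizes the elementwise minimum of the two surrogate terms; returned here as a loss (negated mean)
    return -np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))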

View File

@@ -0,0 +1,415 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import itertools
from enum import Enum
from typing import Union, List, Dict
import numpy as np
from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.base_parameters import AgentParameters, VisualizationParameters
# from rl_coach.environments.environment_interface import ActionSpace
from rl_coach.spaces import ActionSpace
from rl_coach.spaces import AgentSelection, AttentionActionSpace, ObservationSpace, SpacesDefinition
from rl_coach.utils import short_dynamic_import
from rl_coach.core_types import ActionInfo, EnvResponse, ActionType, RunPhase
from rl_coach.filters.observation.observation_crop_filter import ObservationCropFilter
class DecisionPolicy(object):
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, decide on a single action to take.
:param actions_info: a dictionary of agent names and their corresponding
ActionInfo instances, containing the information for each agent's action
:return: a single action and the corresponding action info
"""
raise NotImplementedError("")
class SingleDecider(DecisionPolicy):
"""
A decision policy that chooses the action according to the agent that is currently in control.
"""
def __init__(self, default_decision_maker: str):
super().__init__()
self._decision_maker = default_decision_maker
@property
def decision_maker(self):
"""
Get the decision maker that was set by the upper level control.
"""
return self._decision_maker
@decision_maker.setter
def decision_maker(self, decision_maker: str):
"""
Set the decision maker by the upper level control.
:param decision_maker: the name of the decision making agent, as set by the upper level control.
"""
self._decision_maker = decision_maker
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, take the action of the current decision maker
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action and the corresponding action info
"""
if self.decision_maker not in actions_info.keys():
raise ValueError("The current decision maker ({}) does not exist in the given actions ({})"
.format(self.decision_maker, actions_info.keys()))
return actions_info[self.decision_maker]
class RoundRobin(DecisionPolicy):
"""
A decision policy that chooses the action according to agents selected in a circular order.
"""
def __init__(self, num_agents: int):
super().__init__()
self.round_robin = itertools.cycle(range(num_agents))
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, take the action of the current decision maker, which is
selected in a circular order
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
decision_maker = next(self.round_robin)
if decision_maker not in range(len(actions_info)):
raise ValueError("The size of actions_info does not match the number of agents set for the RoundRobin"
" decision policy.")
# dict views are not indexable, so convert the values to a list before indexing
return list(actions_info.values())[decision_maker]
class MajorityVote(DecisionPolicy):
"""
A decision policy that chooses the action that most of the agents chose.
This policy is only useful for discrete control.
"""
def __init__(self):
super().__init__()
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, take the action that most of the agents agree on
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
# TODO: enforce discrete action spaces
if len(actions_info.keys()) == 0:
raise ValueError("The given list of actions is empty")
vote_count = np.bincount([action_info.action for action_info in actions_info.values()])
majority_vote = np.argmax(vote_count)
# return the action that received the most votes (dict views are not indexable by an action value)
return ActionInfo(int(majority_vote))
class MeanDecision(DecisionPolicy):
"""
A decision policy that takes the mean action given the actions of all the agents.
This policy is only useful for continuous control.
"""
def __init__(self):
super().__init__()
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, take the mean action
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
# TODO: enforce continuous action spaces
if len(actions_info.keys()) == 0:
raise ValueError("The given list of actions is empty")
mean = np.mean([action_info.action for action_info in actions_info.values()], axis=0)
return ActionInfo(mean)
class RewardPolicy(Enum):
ReachingGoal = 0
NativeEnvironmentReward = 1
AccumulatedEnvironmentRewards = 2
class CompositeAgent(AgentInterface):
"""
A CompositeAgent is a group of agents in the same hierarchy level.
In a CompositeAgent, each agent may take the role of either a controller or an observer.
Each agent that is defined as an observer gets observations from the environment.
Each agent that is defined as a controller can potentially also control the environment, in addition to observing it.
There are several ways to decide on the action from different controller agents:
1. Ensemble -
- Take the majority vote (discrete controls)
- Take the mean action (continuous controls)
- Round robin between the agents (discrete/continuous)
2. Skills -
- At each step a single agent decides (chosen by the upper hierarchy controlling agent)
A CompositeAgent can be controlled using one of the following methods (ActionSpaces):
1. Goals (in terms of measurements, observation, embedding or a change in those values)
2. Agent Selection (skills) / Discrete action space.
3. Attention (a subset of the real environment observation / action space)
"""
def __init__(self,
agents_parameters: Union[AgentParameters, Dict[str, AgentParameters]],
visualization_parameters: VisualizationParameters,
decision_policy: DecisionPolicy,
out_action_space: ActionSpace,
in_action_space: Union[None, ActionSpace]=None,
decision_makers: Union[bool, Dict[str, bool]]=True,
reward_policy: RewardPolicy=RewardPolicy.NativeEnvironmentReward,
name="CompositeAgent"):
"""
Construct an agent group
:param agents_parameters: the parameters of each of the agents in the group (a single AgentParameters instance or a dictionary of agent names to AgentParameters)
:param decision_policy: the decision policy of the group which describes how actions are consolidated
:param out_action_space: the type of action space that is used by this composite agent in order to control the
underlying environment
:param in_action_space: the type of action space that is used by the upper level agent in order to control this
group
:param decision_makers: a dictionary mapping each agent name to a boolean stating whether it has decision
privileges or is just an observer
:param reward_policy: the type of the reward that the group receives
"""
super().__init__()
if isinstance(agents_parameters, AgentParameters):
decision_makers = {agents_parameters.name: True}
agents_parameters = {agents_parameters.name: agents_parameters}
self.agents_parameters = agents_parameters
self.visualization_parameters = visualization_parameters
self.decision_makers = decision_makers
self.decision_policy = decision_policy
self.in_action_space = in_action_space
self.out_action_space = out_action_space # TODO: this is not being used
self.reward_policy = reward_policy
self.full_name_id = self.name = name
self.current_decision_maker = 0
self.environment = None
self.agents = {} # key = agent_name, value = agent
self.incoming_action = None
self.last_state = None
self._phase = RunPhase.HEATUP
self.last_action_info = None
self.current_episode = 0
self.parent_level_manager = None
# environment spaces
self.spaces = None
# counters for logging
self.total_steps_counter = 0
self.current_episode_steps_counter = 0
self.total_reward_in_current_episode = 0
# validate input
if set(self.decision_makers) != set(self.agents_parameters):
raise ValueError("The decision_makers dictionary keys do not match the names of the given agents")
if sum(self.decision_makers.values()) > 1 and type(self.decision_policy) == SingleDecider \
and type(self.in_action_space) != AgentSelection:
raise ValueError("When the control policy is set to single decider, the master policy should control the "
"agent group via agent selection (ControlType.AgentSelection)")
@property
def parent(self):
"""
Get the parent of the composite agent
:return: the parent of the composite agent
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent of the composite agent.
Additionally, updates the full name of the agent
:param val: the new parent
:return: None
"""
self._parent = val
if not hasattr(self._parent, 'name'):
raise ValueError("The parent of a composite agent must have a name")
self.full_name_id = "{}/{}".format(self._parent.name, self.name)
def create_agents(self):
for agent_name, agent_parameters in self.agents_parameters.items():
agent_parameters.name = agent_name
# create agent
self.agents[agent_parameters.name] = short_dynamic_import(agent_parameters.path)(agent_parameters,
parent=self)
self.agents[agent_parameters.name].parent_level_manager = self.parent_level_manager
# TODO: this is a bit too specific to be defined here
# add an attention cropping filter if the incoming directives are attention boxes
if isinstance(self.in_action_space, AttentionActionSpace):
attention_size = self.in_action_space.forced_attention_size
for agent in self.agents.values():
agent.input_filter.observation_filters['attention'] = \
ObservationCropFilter(crop_low=np.zeros_like(attention_size), crop_high=attention_size)
agent.input_filter.observation_filters.move_to_end('attention', last=False) # add the cropping at the beginning
def setup_logger(self) -> None:
"""
Setup the logger for all the agents in the composite agent
:return: None
"""
[agent.setup_logger() for agent in self.agents.values()]
def set_session(self, sess) -> None:
"""
Set the deep learning framework session for all the agents in the composite agent
:return: None
"""
[agent.set_session(sess) for agent in self.agents.values()]
def set_environment_parameters(self, spaces: SpacesDefinition):
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules
:param spaces: the definitions of all the spaces of the environment
:return: None
"""
self.spaces = copy.deepcopy(spaces)
[agent.set_environment_parameters(self.spaces) for agent in self.agents.values()]
@property
def phase(self):
return self._phase
@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the current phase of all the agents in the group
:param val: the new phase
:return: None
"""
self._phase = val
for agent in self.agents.values():
agent.phase = val
def end_episode(self) -> None:
"""
End an episode
:return: None
"""
self.current_episode += 1
[agent.handle_episode_ended() for agent in self.agents.values()]
def reset_internal_state(self) -> None:
"""
Reset the episode for all the agents in the group
:return: None
"""
# update counters
self.total_steps_counter = 0
self.current_episode_steps_counter = 0
self.total_reward_in_current_episode = 0
# reset all sub modules
[agent.reset_internal_state() for agent in self.agents.values()]
def train(self) -> Union[float, List]:
"""
Make a single training step for all the agents of the group
:return: a list of loss values from the training step
"""
return [agent.train() for agent in self.agents.values()]
def act(self) -> ActionInfo:
"""
Get the actions from all the agents in the group. Then use the decision policy in order to
extract a single action out of the list of actions.
:return: the chosen action and its corresponding information
"""
# update counters
self.total_steps_counter += 1
self.current_episode_steps_counter += 1
# get the actions info from all the agents
actions_info = {}
for agent_name, agent in self.agents.items():
action_info = agent.act()
actions_info[agent_name] = action_info
# decide on a single action to apply to the environment
action_info = self.decision_policy.choose_action(actions_info)
# TODO: make the last action info a property?
# pass the action info to all the observers
for agent_name, is_decision_maker in self.decision_makers.items():
if not is_decision_maker:
self.agents[agent_name].last_action_info = action_info
self.last_action_info = action_info
return self.last_action_info
def observe(self, env_response: EnvResponse) -> bool:
"""
Given a response from the environment as an EnvResponse, filter it and pass it to the agents.
This method has two main jobs:
1. Wrap the previous transition, ending with the new observation coming from the EnvResponse.
2. Save the next_state as the current_state to take action upon for the next call to act().
:param env_response: the response coming from the environment
:return: True if the episode should end, False otherwise
"""
# accumulate the unfiltered rewards for visualization
self.total_reward_in_current_episode += env_response.reward
episode_ended = env_response.game_over
# pass the env_response to all the sub-agents
# TODO: what if one agent decides to end the episode but the others don't? who decides?
for agent_name, agent in self.agents.items():
goal_reached = agent.observe(env_response)
episode_ended = episode_ended or goal_reached
# TODO: unlike for a single agent, here we also treat a game over by the environment.
# probably better to only return the agents' goal_reached decisions.
return episode_ended
def save_checkpoint(self, checkpoint_id: int) -> None:
[agent.save_checkpoint(checkpoint_id) for agent in self.agents.values()]
def set_incoming_directive(self, action: ActionType) -> None:
self.incoming_action = action
if isinstance(self.decision_policy, SingleDecider) and isinstance(self.in_action_space, AgentSelection):
self.decision_policy.decision_maker = list(self.agents.keys())[action]
if isinstance(self.in_action_space, AttentionActionSpace):
# TODO: redesign to be more modular
for agent in self.agents.values():
agent.input_filter.observation_filters['attention'].crop_low = action[0]
agent.input_filter.observation_filters['attention'].crop_high = action[1]
agent.output_filter.action_filters['masking'].set_masking(action[0], action[1])
# TODO: rethink this scheme. we don't want so many if-else clauses lying around here.
# TODO - for incoming actions which do not involve setting the acting agent we should change the
# observation_space, goal to pursue, etc accordingly to the incoming action.
def sync(self) -> None:
"""
Sync the agent networks with the global network
:return:
"""
[agent.sync() for agent in self.agents.values()]
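
A hypothetical usage sketch of the decision policies defined above, assuming this module's SingleDecider and rl_coach's ActionInfo (the agent names and actions below are made up for illustration):

from rl_coach.core_types import ActionInfo

actions_info = {'driver': ActionInfo(action=2), 'navigator': ActionInfo(action=0)}

policy = SingleDecider(default_decision_maker='driver')
assert policy.choose_action(actions_info).action == 2

policy.decision_maker = 'navigator'  # the upper level control switches the acting agent
assert policy.choose_action(actions_info).action == 0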

View File

@@ -0,0 +1,192 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.agent import Agent
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
AgentParameters, InputEmbedderParameters, EmbedderScheme
from rl_coach.exploration_policies.ou_process import OUProcessParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import BoxActionSpace, GoalsSpace
from rl_coach.architectures.tensorflow_components.heads.ddpg_actor_head import DDPGActorHeadParameters
from rl_coach.core_types import ActionInfo, EnvironmentSteps
class DDPGCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True),
'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [VHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
self.batch_size = 64
self.async_training = False
self.learning_rate = 0.001
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
class DDPGActorNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True)}
self.middleware_parameters = FCMiddlewareParameters(batchnorm=True)
self.heads_parameters = [DDPGActorHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
self.batch_size = 64
self.async_training = False
self.learning_rate = 0.0001
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
class DDPGAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
self.rate_for_copying_weights_to_target = 0.001
self.num_consecutive_playing_steps = EnvironmentSteps(1)
self.use_target_network_for_evaluation = False
self.action_penalty = 0
self.clip_critic_targets = None # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
self.use_non_zero_discount_for_terminal_states = False
class DDPGAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DDPGAlgorithmParameters(),
exploration=OUProcessParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"actor": DDPGActorNetworkParameters(),
"critic": DDPGCriticNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.ddpg_agent:DDPGAgent'
# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
class DDPGAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.q_values = self.register_signal("Q")
self.TD_targets_signal = self.register_signal("TD targets")
self.action_signal = self.register_signal("actions")
def learn_from_batch(self, batch):
actor = self.networks['actor']
critic = self.networks['critic']
actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# TD error = r + discount * Q_target(s_t+1, mu_target(s_t+1)) - Q(s_t, a_t)
next_actions, actions_mean = actor.parallel_prediction([
(actor.target_network, batch.next_states(actor_keys)),
(actor.online_network, batch.states(actor_keys))
])
critic_inputs = copy.copy(batch.next_states(critic_keys))
critic_inputs['action'] = next_actions
q_st_plus_1 = critic.target_network.predict(critic_inputs)
# calculate the bootstrapped TD targets while discounting terminal states according to
# use_non_zero_discount_for_terminal_states
if self.ap.algorithm.use_non_zero_discount_for_terminal_states:
TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1
else:
TD_targets = batch.rewards(expand_dims=True) + \
(1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1
# clip the TD targets to prevent overestimation errors
if self.ap.algorithm.clip_critic_targets:
TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets)
self.TD_targets_signal.add_sample(TD_targets)
# get the gradients of the critic output with respect to the action
critic_inputs = copy.copy(batch.states(critic_keys))
critic_inputs['action'] = actions_mean
action_gradients = critic.online_network.predict(critic_inputs,
outputs=critic.online_network.gradients_wrt_inputs[0]['action'])
# train the critic
critic_inputs = copy.copy(batch.states(critic_keys))
critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1)
result = critic.train_and_sync_networks(critic_inputs, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
# apply the gradients from the critic to the actor
initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients}
gradients = actor.online_network.predict(batch.states(actor_keys),
outputs=actor.online_network.weighted_gradients[0],
initial_feed_dict=initial_feed_dict)
if actor.has_global:
actor.apply_gradients_to_global_network(gradients)
actor.update_online_network()
else:
actor.apply_gradients_to_online_network(gradients)
return total_loss, losses, unclipped_grads
def train(self):
return Agent.train(self)
def choose_action(self, curr_state):
if not (isinstance(self.spaces.action, BoxActionSpace) or isinstance(self.spaces.action, GoalsSpace)):
raise ValueError("DDPG works only for continuous control problems")
# convert to batch so we can run it through the network
tf_input_state = self.prepare_batch_for_inference(curr_state, 'actor')
if self.ap.algorithm.use_target_network_for_evaluation:
actor_network = self.networks['actor'].target_network
else:
actor_network = self.networks['actor'].online_network
action_values = actor_network.predict(tf_input_state).squeeze()
action = self.exploration_policy.get_action(action_values)
self.action_signal.add_sample(action)
# get q value
tf_input_state = self.prepare_batch_for_inference(curr_state, 'critic')
action_batch = np.expand_dims(action, 0)
if type(action) != np.ndarray:
action_batch = np.array([[action]])
tf_input_state['action'] = action_batch
q_value = self.networks['critic'].online_network.predict(tf_input_state)[0]
self.q_values.add_sample(q_value)
action_info = ActionInfo(action=action,
action_value=q_value)
return action_info
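
The parameters above copy the online weights to the target networks every environment step with rate_for_copying_weights_to_target = 0.001, i.e. a soft (Polyak) update. A minimal sketch of that update rule, assuming the weights are given as plain numpy arrays (the actual update is handled by the network wrappers):

import numpy as np

def soft_update(target_weights, online_weights, tau=0.001):
    # target <- tau * online + (1 - tau) * target, applied per weight tensor
    return [tau * w_online + (1.0 - tau) * w_target
            for w_online, w_target in zip(online_weights, target_weights)]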

View File

@@ -0,0 +1,69 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.schedules import LinearSchedule
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.core_types import EnvironmentSteps
class DDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(30000)
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.exploration.evaluation_epsilon = 0.001
@property
def path(self):
return 'rl_coach.agents.ddqn_agent:DDQNAgent'
# Double DQN - https://arxiv.org/abs/1509.06461
class DDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# initialize with the current prediction so that we will
# only update the action that we have actually done in this transition
TD_errors = []
for i in range(self.ap.network_wrappers['main'].batch_size):
new_target = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * q_st_plus_1[i][selected_actions[i]]
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
TD_targets[i, batch.actions()[i]] = new_target
# update errors in prioritized replay buffer
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
importance_weights=importance_weights)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
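
For comparison, a single-transition sketch of the Double DQN target computed in the loop above, next to the vanilla DQN target it replaces (illustrative function names, plain numpy inputs):

import numpy as np

def ddqn_target(reward, game_over, discount, q_next_online, q_next_target):
    # the online network selects the action, the target network evaluates it
    selected_action = np.argmax(q_next_online)
    return reward + (1.0 - game_over) * discount * q_next_target[selected_action]

def dqn_target(reward, game_over, discount, q_next_target):
    # vanilla DQN uses the target network both to select and to evaluate the action
    return reward + (1.0 - game_over) * discount * np.max(q_next_target)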

View File

@@ -0,0 +1,219 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from enum import Enum
from typing import Union
import numpy as np
from rl_coach.agents.agent import Agent
from rl_coach.architectures.tensorflow_components.architecture import Conv2d, Dense
from rl_coach.architectures.tensorflow_components.heads.measurements_prediction_head import MeasurementsPredictionHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
InputEmbedderParameters, MiddlewareScheme
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace
from rl_coach.core_types import ActionInfo, EnvironmentSteps, RunPhase
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class HandlingTargetsAfterEpisodeEnd(Enum):
LastStep = 0
NAN = 1
class DFPNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='leaky_relu'),
'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
'goal': InputEmbedderParameters(activation_function='leaky_relu')}
self.input_embedders_parameters['observation'].scheme = [
Conv2d([32, 8, 4]),
Conv2d([64, 4, 2]),
Conv2d([64, 3, 1]),
Dense([512]),
]
self.input_embedders_parameters['measurements'].scheme = [
Dense([128]),
Dense([128]),
Dense([128]),
]
self.input_embedders_parameters['goal'].scheme = [
Dense([128]),
Dense([128]),
Dense([128]),
]
self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
scheme=MiddlewareScheme.Empty)
self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
self.loss_weights = [1.0]
self.async_training = False
self.batch_size = 64
self.adam_optimizer_beta1 = 0.95
class DFPMemoryParameters(EpisodicExperienceReplayParameters):
def __init__(self):
self.max_size = (MemoryGranularity.Transitions, 20000)
self.shared_memory = True
super().__init__()
class DFPAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_predicted_steps_ahead = 6
self.goal_vector = [1.0, 1.0]
self.future_measurements_weights = [0.5, 0.5, 1.0]
self.use_accumulated_reward_as_measurement = False
self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
self.scale_measurements_targets = {}
self.num_consecutive_playing_steps = EnvironmentSteps(8)
class DFPAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DFPAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=DFPMemoryParameters(),
networks={"main": DFPNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.dfp_agent:DFPAgent'
# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
class DFPAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.current_goal = self.ap.algorithm.goal_vector
self.target_measurements_scale_factors = None
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
network_inputs = batch.states(network_keys)
network_inputs['goal'] = np.repeat(np.expand_dims(self.current_goal, 0),
self.ap.network_wrappers['main'].batch_size, axis=0)
# get the current outputs of the network
targets = self.networks['main'].online_network.predict(network_inputs)
# change the targets for the taken actions
for i in range(self.ap.network_wrappers['main'].batch_size):
targets[i, batch.actions()[i]] = batch[i].info['future_measurements'].flatten()
result = self.networks['main'].train_and_sync_networks(network_inputs, targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def choose_action(self, curr_state):
if self.exploration_policy.requires_action_values():
# predict the future measurements
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
tf_input_state['goal'] = np.expand_dims(self.current_goal, 0)
measurements_future_prediction = self.networks['main'].online_network.predict(tf_input_state)[0]
action_values = np.zeros(len(self.spaces.action.actions))
num_steps_used_for_objective = len(self.ap.algorithm.future_measurements_weights)
# calculate the score of each action by multiplying its predicted future measurements with the goal vector
for action_idx in range(len(self.spaces.action.actions)):
action_measurements = measurements_future_prediction[action_idx]
action_measurements = np.reshape(action_measurements,
(self.ap.algorithm.num_predicted_steps_ahead,
self.spaces.state['measurements'].shape[0]))
future_steps_values = np.dot(action_measurements, self.current_goal)
action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:],
self.ap.algorithm.future_measurements_weights)
else:
action_values = None
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
action = self.exploration_policy.get_action(action_values)
if action_values is not None:
action_values = action_values.squeeze()
action_info = ActionInfo(action=action, action_value=action_values[action])
else:
action_info = ActionInfo(action=action)
return action_info
def set_environment_parameters(self, spaces: SpacesDefinition):
self.spaces = copy.deepcopy(spaces)
self.spaces.goal = VectorObservationSpace(shape=self.spaces.state['measurements'].shape,
measurements_names=
self.spaces.state['measurements'].measurements_names)
# if the user has provided some scale values, check that the given names match the measurement names
if set(self.spaces.state['measurements'].measurements_names).intersection(
self.ap.algorithm.scale_measurements_targets.keys()) !=\
set(self.ap.algorithm.scale_measurements_targets.keys()):
raise ValueError("Some of the keys in parameter scale_measurements_targets ({}) are not defined in "
"the measurements space {}".format(self.ap.algorithm.scale_measurements_targets.keys(),
self.spaces.state['measurements'].measurements_names))
super().set_environment_parameters(self.spaces)
# the below is done after calling the base class method, as it might add accumulated reward as a measurement
# fill out the missing measurements scale factors
for measurement_name in self.spaces.state['measurements'].measurements_names:
if measurement_name not in self.ap.algorithm.scale_measurements_targets:
self.ap.algorithm.scale_measurements_targets[measurement_name] = 1
self.target_measurements_scale_factors = \
np.array([self.ap.algorithm.scale_measurements_targets[measurement_name] for measurement_name in
self.spaces.state['measurements'].measurements_names])
def handle_episode_ended(self):
last_episode = self.current_episode_buffer
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP] and last_episode:
self._update_measurements_targets(last_episode,
self.ap.algorithm.num_predicted_steps_ahead)
super().handle_episode_ended()
def _update_measurements_targets(self, episode, num_steps):
if 'measurements' not in episode.transitions[0].state or episode.transitions[0].state['measurements'] == []:
raise ValueError("Measurements are not present in the transitions of the last episode played. ")
measurements_size = self.spaces.state['measurements'].shape[0]
for transition_idx, transition in enumerate(episode.transitions):
transition.info['future_measurements'] = np.zeros((num_steps, measurements_size))
for step in range(num_steps):
offset_idx = transition_idx + 2 ** step
if offset_idx >= episode.length():
if self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.NAN:
# the special MSE loss will ignore those entries so that the gradient will be 0 for these
transition.info['future_measurements'][step] = np.nan
continue
elif self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.LastStep:
offset_idx = - 1
transition.info['future_measurements'][step] = \
self.target_measurements_scale_factors * \
(episode.transitions[offset_idx].state['measurements'] - transition.state['measurements'])
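
The offsets used in _update_measurements_targets above are exponentially spaced, so each transition predicts the change in measurements at 1, 2, 4, ... steps into the future. For example, with the default num_predicted_steps_ahead = 6:

offsets = [2 ** step for step in range(6)]  # [1, 2, 4, 8, 16, 32] steps ahead of the current transition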

View File

@@ -0,0 +1,99 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
InputEmbedderParameters, MiddlewareScheme
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.core_types import EnvironmentSteps
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class DQNAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
self.num_consecutive_playing_steps = EnvironmentSteps(4)
self.discount = 0.99
class DQNNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
self.heads_parameters = [QHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = True
self.create_target_network = True
class DQNAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DQNAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=ExperienceReplayParameters(),
networks={"main": DQNNetworkParameters()})
self.exploration.epsilon_schedule = LinearSchedule(1, 0.1, 1000000)
self.exploration.evaluation_epsilon = 0.05
@property
def path(self):
return 'rl_coach.agents.dqn_agent:DQNAgent'
# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
class DQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# for the action we actually took, the error is:
# TD error = r + discount*max(q_st_plus_1) - q_st
# for all other actions, the error is 0
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# only update the action that we have actually done in this transition
TD_errors = []
for i in range(self.ap.network_wrappers['main'].batch_size):
new_target = batch.rewards()[i] +\
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
TD_targets[i, batch.actions()[i]] = new_target
# update errors in prioritized replay buffer
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
importance_weights=importance_weights)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
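
A worked example of the masking trick above: the TD targets are initialized with the online network's own predictions, so only the entry of the action that was actually taken produces a non-zero error (the values below are made up for illustration):

import numpy as np

q_current = np.array([1.0, 2.0, 3.0])      # online prediction for one state with 3 actions
td_target = q_current.copy()               # start from the current prediction
action_taken, reward, discount = 1, 0.5, 0.99
max_q_next = 2.5                           # max over the target network's next-state Q values
td_target[action_taken] = reward + discount * max_q_next   # 2.975
# resulting error vector for the loss: [0.0, 0.975, 0.0]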

View File

@@ -0,0 +1,108 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
import copy
from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters
from rl_coach.core_types import RunPhase
from rl_coach.spaces import SpacesDefinition
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
def __init__(self):
super().__init__()
self.time_limit = 40
self.sub_goal_testing_rate = 0.5
class HACDDPGAgentParameters(DDPGAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = HACDDPGAlgorithmParameters()
@property
def path(self):
return 'rl_coach.agents.hac_ddpg_agent:HACDDPGAgent'
# Hierarchical Actor Critic Generating Subgoals DDPG Agent - https://arxiv.org/pdf/1712.00948.pdf
class HACDDPGAgent(DDPGAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate
self.graph_manager = None
def choose_action(self, curr_state):
# the top level agent decides, for each of its generated sub-goals, whether all the layers beneath it are in a
# sub-goal testing phase
graph_manager = self.parent_level_manager.parent_graph_manager
if self.ap.is_a_highest_level_agent:
graph_manager.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate
if self.phase == RunPhase.TRAIN:
if graph_manager.should_test_current_sub_goal:
self.exploration_policy.change_phase(RunPhase.TEST)
else:
self.exploration_policy.change_phase(self.phase)
action_info = super().choose_action(curr_state)
return action_info
def update_transition_before_adding_to_replay_buffer(self, transition):
graph_manager = self.parent_level_manager.parent_graph_manager
# deal with goals given from a higher level agent
if not self.ap.is_a_highest_level_agent:
transition.state['desired_goal'] = self.current_hrl_goal
transition.next_state['desired_goal'] = self.current_hrl_goal
# TODO: allow setting goals which are not part of the state. e.g. state-embedding using get_prediction
self.distance_from_goal.add_sample(self.spaces.goal.distance_from_goal(
self.current_hrl_goal, transition.next_state))
goal_reward, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
self.current_hrl_goal, transition.next_state)
transition.reward = goal_reward
transition.game_over = transition.game_over or sub_goal_reached
# each level tests its own generated sub goals
if not self.ap.is_a_lowest_level_agent and graph_manager.should_test_current_sub_goal:
#TODO-fixme
# _, sub_goal_reached = self.parent_level_manager.environment.agents['agent_1'].spaces.goal.\
# get_reward_for_goal_and_state(transition.action, transition.next_state)
_, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
transition.action, transition.next_state)
sub_goal_is_missed = not sub_goal_reached
if sub_goal_is_missed:
transition.reward = -self.ap.algorithm.time_limit
return transition
def set_environment_parameters(self, spaces: SpacesDefinition):
super().set_environment_parameters(spaces)
if self.ap.is_a_highest_level_agent:
# the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will have
# their GoalsSpace set to the in_action_space in agent.set_environment_parameters()
self.spaces.goal = self.spaces.action
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
if not self.ap.is_a_highest_level_agent:
self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward

View File

@@ -0,0 +1,115 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
from collections import OrderedDict
from typing import Union
import pygame
from rl_coach.agents.agent import Agent
from rl_coach.agents.bc_agent import BCNetworkParameters
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, InputEmbedderParameters, EmbedderScheme, \
AgentParameters
from rl_coach.core_types import ActionInfo
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from pandas import to_pickle
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.logger import screen
class HumanAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
class HumanNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.input_embedders_parameters['observation'].scheme = EmbedderScheme.Medium
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
class HumanAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=HumanAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": BCNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.human_agent:HumanAgent'
class HumanAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.clock = pygame.time.Clock()
self.max_fps = int(self.ap.visualization.max_fps_for_human_control)
self.env = None
def init_environment_dependent_modules(self):
super().init_environment_dependent_modules()
self.env = self.parent_level_manager._real_environment
screen.log_title("Human Control Mode")
available_keys = self.env.get_available_keys()
if available_keys:
screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
screen.log("")
for action, key in available_keys:
screen.log("\t- {}: {}".format(action, key))
screen.separator()
def train(self):
return 0
def choose_action(self, curr_state):
action = ActionInfo(self.env.get_action_from_user(), action_value=0)
action = self.output_filter.reverse_filter(action)
# keep constant fps
self.clock.tick(self.max_fps)
if not self.env.renderer.is_open:
self.save_replay_buffer_and_exit()
return action
def save_replay_buffer_and_exit(self):
replay_buffer_path = os.path.join(self.agent_logger.experiments_path, 'replay_buffer.p')
self.memory.tp = None
to_pickle(self.memory, replay_buffer_path)
screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
exit()
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Episode"] = self.current_episode
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
screen.log_dict(log, prefix="Recording")

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import OrderedDict
from typing import Union
from rl_coach.core_types import RunPhase, ActionInfo
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.agents.agent import Agent
from rl_coach.logger import screen
## This is an abstract agent - there is no learn_from_batch method ##
# Imitation Agent
class ImitationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.imitation = True
def extract_action_values(self, prediction):
return prediction.squeeze()
def choose_action(self, curr_state):
# convert to batch so we can run it through the network
prediction = self.networks['main'].online_network.predict(self.prepare_batch_for_inference(curr_state, 'main'))
# get action values and extract the best action from it
action_values = self.extract_action_values(prediction)
if type(self.spaces.action) == DiscreteActionSpace:
# DISCRETE
self.exploration_policy.phase = RunPhase.TEST
action = self.exploration_policy.get_action(action_values)
action_info = ActionInfo(action=action,
action_probability=action_values[action])
else:
# CONTINUOUS
action = action_values
action_info = ActionInfo(action=action)
return action_info
def log_to_screen(self):
# log to screen
if self.phase == RunPhase.TRAIN:
# for the training phase - we log during the episode to visualize the progress in training
log = OrderedDict()
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Loss"] = self.loss.values[-1]
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix="Training")
else:
# for the evaluation phase - logging as in regular RL
super().log_to_screen()
def learn_from_batch(self, batch):
raise NotImplementedError("ImitationAgent is an abstract agent. Not to be used directly.")

View File

@@ -0,0 +1,72 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.monte_carlo_mixing_rate = 0.1
class MixedMonteCarloAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = MixedMonteCarloAlgorithmParameters()
self.memory = EpisodicExperienceReplayParameters()
@property
def path(self):
return 'rl_coach.agents.mmc_agent:MixedMonteCarloAgent'
class MixedMonteCarloAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# for the 1-step target we use the double-DQN formulation, so actions are selected greedily by the online network
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
# TD_targets are initialized with the current prediction so that we will
# only update the action that we have actually done in this transition
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
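# mix the 1-step (double DQN) target with the full Monte Carlo return, per transition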
for i in range(self.ap.network_wrappers['main'].batch_size):
one_step_target = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
q_st_plus_1[i][selected_actions[i]]
monte_carlo_target = batch.total_returns()[i]
TD_targets[i, batch.actions()[i]] = (1 - self.mixing_rate) * one_step_target + \
self.mixing_rate * monte_carlo_target
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
InputEmbedderParameters
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.utils import last_sample
from rl_coach.core_types import EnvironmentSteps
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class NStepQNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [QHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.async_training = True
self.shared_optimizer = True
self.create_target_network = True
class NStepQAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
self.apply_gradients_every_x_episodes = 1
self.num_steps_between_gradient_updates = 5 # this is called t_max in all the papers
self.targets_horizon = 'N-Step'
class NStepQAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NStepQAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=SingleEpisodeBufferParameters(),
networks={"main": NStepQNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.n_step_q_agent:NStepQAgent'
# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.last_gradient_update_step_idx = 0
self.q_values = self.register_signal('Q Values')
self.value_loss = self.register_signal('Value Loss')
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the values for the current states
state_value_head_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
# the targets for the state value estimator
if self.ap.algorithm.targets_horizon == '1-Step':
# 1-Step Q learning
q_st_plus_1 = self.networks['main'].target_network.predict(batch.next_states(network_keys))
for i in reversed(range(batch.size)):
state_value_head_targets[i][batch.actions()[i]] = \
batch.rewards()[i] \
+ (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
elif self.ap.algorithm.targets_horizon == 'N-Step':
# N-Step Q learning
if batch.game_overs()[-1]:
R = 0
else:
R = np.max(self.networks['main'].target_network.predict(last_sample(batch.next_states(network_keys))))
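# accumulate the discounted n-step return backwards over the collected transitions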
for i in reversed(range(batch.size)):
R = batch.rewards()[i] + self.ap.algorithm.discount * R
state_value_head_targets[i][batch.actions()[i]] = R
else:
raise ValueError('The available values for targets_horizon are: 1-Step, N-Step')
# train
result = self.networks['main'].online_network.accumulate_gradients(batch.states(network_keys), [state_value_head_targets])
# logging
total_loss, losses, unclipped_grads = result[:3]
self.value_loss.add_sample(losses[0])
return total_loss, losses, unclipped_grads
def train(self):
# update the target network of every network that has a target network
if any([network.has_target for network in self.networks.values()]) \
and self._should_update_online_weights_to_target():
for network in self.networks.values():
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
self.agent_logger.create_signal_value('Update Target Network', 1)
else:
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
return PolicyOptimizationAgent.train(self)

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.naf_head import NAFHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, \
NetworkParameters, InputEmbedderParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import BoxActionSpace
from rl_coach.core_types import ActionInfo, EnvironmentSteps
from rl_coach.exploration_policies.ou_process import OUProcessParameters
class NAFNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [NAFHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.learning_rate = 0.001
self.async_training = True
self.create_target_network = True
class NAFAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_consecutive_training_steps = 5
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
self.rate_for_copying_weights_to_target = 0.001
class NAFAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NAFAlgorithmParameters(),
exploration=OUProcessParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": NAFNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.naf_agent:NAFAgent'
# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
class NAFAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.l_values = self.register_signal("L")
self.a_values = self.register_signal("Advantage")
self.mu_values = self.register_signal("Action")
self.v_values = self.register_signal("V")
self.TD_targets = self.register_signal("TD targets")
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# TD error = r + discount*v_st_plus_1 - q_st
v_st_plus_1 = self.networks['main'].target_network.predict(
batch.next_states(network_keys),
self.networks['main'].target_network.output_heads[0].V,
squeeze_output=False,
)
TD_targets = np.expand_dims(batch.rewards(), -1) + \
(1.0 - np.expand_dims(batch.game_overs(), -1)) * self.ap.algorithm.discount * v_st_plus_1
self.TD_targets.add_sample(TD_targets)
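# the taken actions are fed as an extra network input ('output_0_0') so the NAF head can compose Q(s, a) = V(s) + A(s, a)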
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
'output_0_0': batch.actions(len(batch.actions().shape) == 1)
}, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def choose_action(self, curr_state):
if type(self.spaces.action) != BoxActionSpace:
raise ValueError('NAF works only for continuous control problems')
# convert to batch so we can run it through the network
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
naf_head = self.networks['main'].online_network.output_heads[0]
action_values = self.networks['main'].online_network.predict(tf_input_state, outputs=naf_head.mu,
squeeze_output=False)
# get the actual action to use
action = self.exploration_policy.get_action(action_values)
# get the internal values for logging
outputs = [naf_head.mu, naf_head.Q, naf_head.L, naf_head.A, naf_head.V]
result = self.networks['main'].online_network.predict(
{**tf_input_state, 'output_0_0': action_values},
outputs=outputs
)
mu, Q, L, A, V = result
# store the q values statistics for logging
self.q_values.add_sample(Q)
self.l_values.add_sample(L)
self.a_values.add_sample(A)
self.mu_values.add_sample(mu)
self.v_values.add_sample(V)
action_info = ActionInfo(action=action, action_value=Q)
return action_info

View File

@@ -0,0 +1,176 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import pickle
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.dnd_q_head import DNDQHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
InputEmbedderParameters
from rl_coach.core_types import RunPhase, EnvironmentSteps, Episode, StateType
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters, MemoryGranularity
from rl_coach.schedules import ConstantSchedule
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.logger import screen
class NECNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [DNDQHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
class NECAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.dnd_size = 500000
self.l2_norm_added_delta = 0.001
self.new_value_shift_coefficient = 0.1
self.number_of_knn = 50
self.DND_key_error_threshold = 0
self.num_consecutive_playing_steps = EnvironmentSteps(4)
self.propagate_updates_to_DND = False
self.n_step = 100
self.bootstrap_total_return_from_old_policy = True
class NECMemoryParameters(EpisodicExperienceReplayParameters):
def __init__(self):
super().__init__()
self.max_size = (MemoryGranularity.Transitions, 100000)
class NECAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NECAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=NECMemoryParameters(),
networks={"main": NECNetworkParameters()})
self.exploration.epsilon_schedule = ConstantSchedule(0.1)
self.exploration.evaluation_epsilon = 0.01
@property
def path(self):
return 'rl_coach.agents.nec_agent:NECAgent'
# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
class NECAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.current_episode_state_embeddings = []
self.training_started = False
self.current_episode_buffer = \
Episode(discount=self.ap.algorithm.discount,
n_step=self.ap.algorithm.n_step,
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
def learn_from_batch(self, batch):
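# the DND must hold at least number_of_knn entries before targets can be computed and training can start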
if not self.networks['main'].online_network.output_heads[0].DND.has_enough_entries(self.ap.algorithm.number_of_knn):
return 0, [], 0
else:
if not self.training_started:
self.training_started = True
screen.log_title("Finished collecting initial entries in DND. Starting to train network...")
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
TD_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
# only update the action that we have actually done in this transition
for i in range(self.ap.network_wrappers['main'].batch_size):
TD_targets[i, batch.actions()[i]] = batch.total_returns()[i]
# set the gradients to fetch for the DND update
fetches = []
head = self.networks['main'].online_network.output_heads[0]
if self.ap.algorithm.propagate_updates_to_DND:
fetches = [head.dnd_embeddings_grad, head.dnd_values_grad, head.dnd_indices]
# train the neural network
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets, fetches)
total_loss, losses, unclipped_grads = result[:3]
# update the DND keys and values using the extracted gradients
if self.ap.algorithm.propagate_updates_to_DND:
embedding_gradients = np.swapaxes(result[-1][0], 0, 1)
value_gradients = np.swapaxes(result[-1][1], 0, 1)
indices = np.swapaxes(result[-1][2], 0, 1)
head.DND.update_keys_and_values(batch.actions(), embedding_gradients, value_gradients, indices)
return total_loss, losses, unclipped_grads
def act(self):
if self.phase == RunPhase.HEATUP:
# get embedding in heatup (otherwise we get it through get_prediction)
embedding = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(self.curr_state, 'main'),
outputs=self.networks['main'].online_network.state_embedding)
self.current_episode_state_embeddings.append(embedding)
return super().act()
def get_all_q_values_for_states(self, states: StateType):
# we need to store the state embeddings regardless of whether the action is random or not
return self.get_prediction(states)
def get_prediction(self, states):
# get the actions q values and the state embedding
embedding, actions_q_values = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(states, 'main'),
outputs=[self.networks['main'].online_network.state_embedding,
self.networks['main'].online_network.output_heads[0].output]
)
if self.phase != RunPhase.TEST:
# store the state embedding for inserting it to the DND later
self.current_episode_state_embeddings.append(embedding.squeeze())
actions_q_values = actions_q_values[0][0]
return actions_q_values
def reset_internal_state(self):
super().reset_internal_state()
self.current_episode_state_embeddings = []
self.current_episode_buffer = \
Episode(discount=self.ap.algorithm.discount,
n_step=self.ap.algorithm.n_step,
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
def handle_episode_ended(self):
super().handle_episode_ended()
# get the last full episode that we have collected
episode = self.call_memory('get_last_complete_episode')
if episode is not None and self.phase != RunPhase.TEST:
assert len(self.current_episode_state_embeddings) == episode.length()
returns = episode.get_transitions_attribute('total_return')
actions = episode.get_transitions_attribute('action')
self.networks['main'].online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
actions, returns)
def save_checkpoint(self, checkpoint_id):
with open(os.path.join(self.ap.task_parameters.save_checkpoint_dir, str(checkpoint_id) + '.dnd'), 'wb') as f:
pickle.dump(self.networks['main'].online_network.output_heads[0].DND, f, pickle.HIGHEST_PROTOCOL)

View File

@@ -0,0 +1,94 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay, \
EpisodicExperienceReplayParameters
class PALAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.pal_alpha = 0.9
self.persistent_advantage_learning = False
self.monte_carlo_mixing_rate = 0.1
class PALAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = PALAlgorithmParameters()
self.memory = EpisodicExperienceReplayParameters()
@property
def path(self):
return 'rl_coach.agents.pal_agent:PALAgent'
# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
class PALAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.alpha = agent_parameters.algorithm.pal_alpha
self.persistent = agent_parameters.algorithm.persistent_advantage_learning
self.monte_carlo_mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# next state values
q_st_plus_1_target, q_st_plus_1_online = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.next_states(network_keys))
])
selected_actions = np.argmax(q_st_plus_1_online, 1)
v_st_plus_1_target = np.max(q_st_plus_1_target, 1)
# current state values
q_st_target, q_st_online = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
v_st_target = np.max(q_st_target, 1)
# calculate TD error
TD_targets = np.copy(q_st_online)
for i in range(self.ap.network_wrappers['main'].batch_size):
TD_targets[i, batch.actions()[i]] = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
q_st_plus_1_target[i][selected_actions[i]]
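# action gaps: V(s) - Q(s, a) for the current state, and V(s') - Q(s', a*) for the next state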
advantage_learning_update = v_st_target[i] - q_st_target[i, batch.actions()[i]]
next_advantage_learning_update = v_st_plus_1_target[i] - q_st_plus_1_target[i, selected_actions[i]]
# Persistent Advantage Learning or Regular Advantage Learning
if self.persistent:
TD_targets[i, batch.actions()[i]] -= self.alpha * min(advantage_learning_update, next_advantage_learning_update)
else:
TD_targets[i, batch.actions()[i]] -= self.alpha * advantage_learning_update
# mixing monte carlo updates
monte_carlo_target = batch.total_returns()[i]
TD_targets[i, batch.actions()[i]] = (1 - self.monte_carlo_mixing_rate) * TD_targets[i, batch.actions()[i]] \
+ self.monte_carlo_mixing_rate * monte_carlo_target
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,105 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.logger import screen
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class PolicyGradientNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.async_training = True
class PolicyGradientAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
self.apply_gradients_every_x_episodes = 5
self.beta_entropy = 0
self.num_steps_between_gradient_updates = 20000 # this is called t_max in all the papers
class PolicyGradientsAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=PolicyGradientAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=SingleEpisodeBufferParameters(),
networks={"main": PolicyGradientNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.policy_gradients_agent:PolicyGradientsAgent'
class PolicyGradientsAgent(PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.returns_mean = self.register_signal('Returns Mean')
self.returns_variance = self.register_signal('Returns Variance')
self.last_gradient_update_step_idx = 0
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
total_returns = batch.total_returns()
for i in reversed(range(batch.size)):
if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
total_returns[i] = total_returns[0]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
# just take the total return as it is
pass
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
if self.std_discounted_return != 0:
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
else:
total_returns[i] = 0
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
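# subtract a per-timestep baseline: the running mean of the returns observed at this timestep across episodes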
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
targets = total_returns
actions = batch.actions()
if type(self.spaces.action) != DiscreteActionSpace and len(actions.shape) < 2:
actions = np.expand_dims(actions, -1)
self.returns_mean.add_sample(np.mean(total_returns))
self.returns_variance.add_sample(np.std(total_returns))
result = self.networks['main'].online_network.accumulate_gradients(
{**batch.states(network_keys), 'output_0_0': actions}, targets
)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,166 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import OrderedDict
from enum import Enum
from typing import Union
import numpy as np
from rl_coach.core_types import Batch, ActionInfo
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
from rl_coach.utils import eps
from rl_coach.agents.agent import Agent
from rl_coach.logger import screen
class PolicyGradientRescaler(Enum):
TOTAL_RETURN = 0
FUTURE_RETURN = 1
FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
FUTURE_RETURN_NORMALIZED_BY_TIMESTEP = 3 # baselined
Q_VALUE = 4
A_VALUE = 5
TD_RESIDUAL = 6
DISCOUNTED_TD_RESIDUAL = 7
GAE = 8
## This is an abstract agent - learn_from_batch is not implemented ##
class PolicyOptimizationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.policy_gradient_rescaler = None
if hasattr(self.ap.algorithm, 'policy_gradient_rescaler'):
self.policy_gradient_rescaler = self.ap.algorithm.policy_gradient_rescaler
# statistics for variance reduction
self.last_gradient_update_step_idx = 0
self.max_episode_length = 100000
self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
self.entropy = self.register_signal('Entropy')
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)
def update_episode_statistics(self, episode):
episode_discounted_returns = []
for i in range(episode.length()):
transition = episode.get_transition(i)
episode_discounted_returns.append(transition.total_return)
self.num_episodes_where_step_has_been_seen[i] += 1
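# incremental running mean: m <- ((n - 1) * m + x) / n, where n already counts the current episode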
self.mean_return_over_multiple_episodes[i] -= self.mean_return_over_multiple_episodes[i] / \
self.num_episodes_where_step_has_been_seen[i]
self.mean_return_over_multiple_episodes[i] += transition.total_return / \
self.num_episodes_where_step_has_been_seen[i]
self.mean_discounted_return = np.mean(episode_discounted_returns)
self.std_discounted_return = np.std(episode_discounted_returns)
def get_current_episode(self):
# most of the time the episode comes from the current episode buffer; only after the last transition is it taken
# from the "memory" (where it was stored at the end of the episode)
return self.memory.get_episode(0) or self.current_episode_buffer
def train(self):
episode = self.get_current_episode()
# check if we should calculate gradients or skip
episode_ended = episode.is_complete
num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
if not (is_t_max_steps_passed or episode_ended):
return 0
total_loss = 0
if num_steps_passed_since_last_update > 0:
# we need to update the returns of the episode until now
episode.update_returns()
# get t_max transitions, or fewer if we reached a terminal state.
# will be used for both actor-critic and vanilla PG.
# In order to get full episodes, Vanilla PG will set the end_idx to a very big value.
transitions = []
start_idx = self.last_gradient_update_step_idx
end_idx = episode.length()
for idx in range(start_idx, end_idx):
transitions.append(episode.get_transition(idx))
self.last_gradient_update_step_idx = end_idx
# update the statistics for the variance reduction techniques
if self.policy_gradient_rescaler in \
[PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
self.update_episode_statistics(episode)
# accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
batch = Batch(transitions)
total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
for network in self.networks.values():
network.apply_gradients_and_sync_networks()
self.training_iteration += 1
# move the pointer to the next episode start and discard the episode.
if episode_ended:
# we need to remove the episode, because the next training iteration will be called before storing any
# additional transitions in the memory (we don't store a transition for the first call to observe), so the
# length of the memory won't be enforced and the old episode won't be removed
self.call_memory('remove_episode', 0)
self.last_gradient_update_step_idx = 0
return total_loss
def learn_from_batch(self, batch):
raise NotImplementedError("PolicyOptimizationAgent is an abstract agent. Not to be used directly.")
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "main")
return self.networks['main'].online_network.predict(tf_input_state)
def choose_action(self, curr_state):
# convert to batch so we can run it through the network
action_values = self.get_prediction(curr_state)
if isinstance(self.spaces.action, DiscreteActionSpace):
# DISCRETE
action_probabilities = np.array(action_values).squeeze()
action = self.exploration_policy.get_action(action_probabilities)
action_info = ActionInfo(action=action,
action_probability=action_probabilities[action])
self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
elif isinstance(self.spaces.action, BoxActionSpace):
# CONTINUOUS
action = self.exploration_policy.get_action(action_values)
action_info = ActionInfo(action=action)
else:
raise ValueError("The action space of the environment is not compatible with the algorithm")
return action_info

View File

@@ -0,0 +1,338 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from collections import OrderedDict
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters, DistributedTaskParameters
from rl_coach.core_types import EnvironmentSteps, Batch
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import force_list
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
from rl_coach.logger import screen
class PPOCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [VHeadParameters()]
self.loss_weights = [1.0]
self.async_training = True
self.l2_regularization = 0
self.create_target_network = True
self.batch_size = 128
class PPOActorNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [PPOHeadParameters()]
self.optimizer_type = 'Adam'
self.loss_weights = [1.0]
self.async_training = True
self.l2_regularization = 0
self.create_target_network = True
self.batch_size = 128
class PPOAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
self.gae_lambda = 0.96
self.target_kl_divergence = 0.01
self.initial_kl_coefficient = 1.0
self.high_kl_penalty_coefficient = 1000
self.clip_likelihood_ratio_using_epsilon = None
self.value_targets_mix_fraction = 0.1
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.use_kl_regularization = True
self.beta_entropy = 0.01
self.num_consecutive_playing_steps = EnvironmentSteps(5000)
class PPOAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=PPOAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"critic": PPOCriticNetworkParameters(), "actor": PPOActorNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.ppo_agent:PPOAgent'
# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
class PPOAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
# signals definition
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
self.kl_divergence = self.register_signal('KL Divergence')
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = self.register_signal('Grads (unclipped)')
def fill_advantages(self, batch):
batch = Batch(batch)
network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# * Found not to have any impact *
# current_states_with_timestep = self.concat_state_and_timestep(batch)
current_state_values = self.networks['critic'].online_network.predict(batch.states(network_keys)).squeeze()
# calculate advantages
advantages = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
advantages = batch.total_returns() - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
# current_state_values[batch.game_overs()] = 0
for idx, game_over in enumerate(batch.game_overs()):
if game_over:
# get advantages for the rollout
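# the rollout ends in a terminal state, so the value of the state following it is bootstrapped with 0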
value_bootstrapping = np.zeros((1,))
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
rollout_advantages, _ = \
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
rollout_state_values)
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
# TODO: this will be problematic with a shared memory
for transition, advantage in zip(self.memory.transitions, advantages):
transition.info['advantage'] = advantage
self.action_advantages.add_sample(advantages)
def train_value_network(self, dataset, epochs):
loss = []
batch = Batch(dataset)
network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# * Found not to have any impact *
# add a timestep to the observation
# current_states_with_timestep = self.concat_state_and_timestep(dataset)
mix_fraction = self.ap.algorithm.value_targets_mix_fraction
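# note: the mix fraction is only applied when the critic is optimized with LBFGS (see the targets computation below)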
for j in range(epochs):
curr_batch_size = batch.size
if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
curr_batch_size = self.ap.network_wrappers['critic'].batch_size
for i in range(batch.size // curr_batch_size):
# split to batches for first order optimization techniques
current_states_batch = {
k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
for k, v in batch.states(network_keys).items()
}
total_return_batch = batch.total_returns(True)[i * curr_batch_size:(i + 1) * curr_batch_size]
old_policy_values = force_list(self.networks['critic'].target_network.predict(
current_states_batch).squeeze())
if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
targets = total_return_batch
else:
current_values = self.networks['critic'].online_network.predict(current_states_batch)
targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction
inputs = copy.copy(current_states_batch)
for input_index, input in enumerate(old_policy_values):
name = 'output_0_{}'.format(input_index)
if name in self.networks['critic'].online_network.inputs:
inputs[name] = input
value_loss = self.networks['critic'].online_network.accumulate_gradients(inputs, targets)
self.networks['critic'].apply_gradients_to_online_network()
if isinstance(self.ap.task_parameters, DistributedTaskParameters):
self.networks['critic'].apply_gradients_to_global_network()
self.networks['critic'].online_network.reset_accumulated_gradients()
loss.append([value_loss[0]])
loss = np.mean(loss, 0)
return loss
def concat_state_and_timestep(self, dataset):
current_states_with_timestep = [np.append(transition.state['observation'], transition.info['timestep'])
for transition in dataset]
current_states_with_timestep = np.expand_dims(current_states_with_timestep, -1)
return current_states_with_timestep
def train_policy_network(self, dataset, epochs):
loss = []
for j in range(epochs):
loss = {
'total_loss': [],
'policy_losses': [],
'unclipped_grads': [],
'fetch_result': []
}
#shuffle(dataset)
for i in range(len(dataset) // self.ap.network_wrappers['actor'].batch_size):
batch = Batch(dataset[i * self.ap.network_wrappers['actor'].batch_size:
(i + 1) * self.ap.network_wrappers['actor'].batch_size])
network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
advantages = batch.info('advantage')
actions = batch.actions()
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
old_policy = force_list(self.networks['actor'].target_network.predict(batch.states(network_keys)))
# calculate gradients and apply on both the local policy network and on the global policy network
fetches = [self.networks['actor'].online_network.output_heads[0].kl_divergence,
self.networks['actor'].online_network.output_heads[0].entropy]
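# the fetched KL divergence drives the adaptive KL penalty in update_kl_coefficient(); the entropy is only logged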
inputs = copy.copy(batch.states(network_keys))
inputs['output_0_0'] = actions
# old_policy_distribution needs to be represented as a list: for discrete controls it has just a mean,
# otherwise it has both a mean and a standard deviation
for input_index, input in enumerate(old_policy):
inputs['output_0_{}'.format(input_index + 1)] = input
total_loss, policy_losses, unclipped_grads, fetch_result =\
self.networks['actor'].online_network.accumulate_gradients(
inputs, [advantages], additional_fetches=fetches)
self.networks['actor'].apply_gradients_to_online_network()
if isinstance(self.ap.task_parameters, DistributedTaskParameters):
self.networks['actor'].apply_gradients_to_global_network()
self.networks['actor'].online_network.reset_accumulated_gradients()
loss['total_loss'].append(total_loss)
loss['policy_losses'].append(policy_losses)
loss['unclipped_grads'].append(unclipped_grads)
loss['fetch_result'].append(fetch_result)
self.unclipped_grads.add_sample(unclipped_grads)
for key in loss.keys():
loss[key] = np.mean(loss[key], 0)
if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
curr_learning_rate = self.networks['critic'].online_network.get_variable_value(self.ap.learning_rate)
self.curr_learning_rate.add_sample(curr_learning_rate)
else:
curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
("training epoch", j),
("learning_rate", curr_learning_rate)
]),
prefix="Policy training"
)
self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
self.entropy.add_sample(loss['fetch_result'][1])
self.kl_divergence.add_sample(loss['fetch_result'][0])
return loss['total_loss']
def update_kl_coefficient(self):
# John Schulman's implementation takes the mean KL divergence only over the last epoch. This is a bit odd,
# but we follow it for now because it is known to work well
screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
# update kl coefficient
kl_target = self.ap.algorithm.target_kl_divergence
kl_coefficient = self.networks['actor'].online_network.get_variable_value(
self.networks['actor'].online_network.output_heads[0].kl_coefficient)
new_kl_coefficient = kl_coefficient
if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
# kl too high => increase regularization
new_kl_coefficient *= 1.5
elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
# kl too low => decrease regularization
new_kl_coefficient /= 1.5
# update the kl coefficient variable
if kl_coefficient != new_kl_coefficient:
self.networks['actor'].online_network.set_variable_value(
self.networks['actor'].online_network.output_heads[0].assign_kl_coefficient,
new_kl_coefficient,
self.networks['actor'].online_network.output_heads[0].kl_coefficient_ph)
screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
def post_training_commands(self):
if self.ap.algorithm.use_kl_regularization:
self.update_kl_coefficient()
# clean memory
self.call_memory('clean')
def train(self):
loss = 0
if self._should_train(wait_for_full_episode=True):
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
self.networks['actor'].sync()
self.networks['critic'].sync()
dataset = self.memory.transitions
self.fill_advantages(dataset)
# take only the requested number of steps
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
value_loss = self.train_value_network(dataset, 1)
policy_loss = self.train_policy_network(dataset, 10)
self.value_loss.add_sample(value_loss)
self.policy_loss.add_sample(policy_loss)
self.post_training_commands()
self.training_iteration += 1
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(value_loss, policy_loss)
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "actor")
return self.networks['actor'].online_network.predict(tf_input_state)

View File

@@ -0,0 +1,112 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.architectures.tensorflow_components.heads.quantile_regression_q_head import QuantileRegressionQHeadParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.core_types import StateType
class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.heads_parameters = [QuantileRegressionQHeadParameters()]
self.learning_rate = 0.00005
self.optimizer_epsilon = 0.01 / 32
class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.atoms = 200
self.huber_loss_interval = 1 # called k in the paper
class QuantileRegressionDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = QuantileRegressionDQNAlgorithmParameters()
self.network_wrappers = {"main": QuantileRegressionDQNNetworkParameters()}
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.exploration.evaluation_epsilon = 0.001
@property
def path(self):
return 'rl_coach.agents.qr_dqn_agent:QuantileRegressionDQNAgent'
# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf
class QuantileRegressionDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.quantile_probabilities = np.ones(self.ap.algorithm.atoms) / float(self.ap.algorithm.atoms)
def get_q_values(self, quantile_values):
return np.dot(quantile_values, self.quantile_probabilities)
# prediction's format is (batch,actions,atoms)
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
quantile_values = self.get_prediction(states)
actions_q_values = self.get_q_values(quantile_values)
else:
actions_q_values = None
return actions_q_values
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the quantiles of the next states and current states
next_state_quantiles, current_quantiles = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# get the optimal actions to take for the next states
target_actions = np.argmax(self.get_q_values(next_state_quantiles), axis=1)
# calculate the Bellman update
batch_idx = list(range(self.ap.network_wrappers['main'].batch_size))
TD_targets = batch.rewards(True) + (1.0 - batch.game_overs(True)) * self.ap.algorithm.discount \
* next_state_quantiles[batch_idx, target_actions]
# get the locations of the selected actions within the batch for indexing purposes
actions_locations = [[b, a] for b, a in zip(batch_idx, batch.actions())]
# calculate the cumulative quantile probabilities and reorder them to fit the sorted quantiles order
cumulative_probabilities = np.array(range(self.ap.algorithm.atoms + 1)) / float(self.ap.algorithm.atoms) # tau_i
quantile_midpoints = 0.5*(cumulative_probabilities[1:] + cumulative_probabilities[:-1]) # tau^hat_i
quantile_midpoints = np.tile(quantile_midpoints, (self.ap.network_wrappers['main'].batch_size, 1))
sorted_quantiles = np.argsort(current_quantiles[batch_idx, batch.actions()])
for idx in range(self.ap.network_wrappers['main'].batch_size):
quantile_midpoints[idx, :] = quantile_midpoints[idx, sorted_quantiles[idx]]
# train
result = self.networks['main'].train_and_sync_networks({
**batch.states(network_keys),
'output_0_0': actions_locations,
'output_0_1': quantile_midpoints,
}, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplay
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.agents.agent import Agent
from rl_coach.core_types import ActionInfo, StateType
## This is an abstract agent - learn_from_batch is not implemented ##
class ValueOptimizationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.q_values = self.register_signal("Q")
self.q_value_for_action = {}
def init_environment_dependent_modules(self):
super().init_environment_dependent_modules()
if isinstance(self.spaces.action, DiscreteActionSpace):
for i in range(len(self.spaces.action.actions)):
self.q_value_for_action[i] = self.register_signal("Q for action {}".format(i),
dump_one_value_per_episode=False,
dump_one_value_per_step=True)
# Algorithms whose Q values need to be derived from the raw network predictions (e.g. distributional agents) override this function
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
actions_q_values = self.get_prediction(states)
else:
actions_q_values = None
return actions_q_values
def get_prediction(self, states):
return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'))
def update_transition_priorities_and_get_weights(self, TD_errors, batch):
# update errors in prioritized replay buffer
importance_weights = None
if isinstance(self.memory, PrioritizedExperienceReplay):
self.call_memory('update_priorities', (batch.info('idx'), TD_errors))
importance_weights = batch.info('weight')
return importance_weights
def _validate_action(self, policy, action):
if np.array(action).shape != ():
raise ValueError((
'The exploration_policy {} returned a vector of actions '
'instead of a single action. ValueOptimizationAgents '
'require exploration policies which return a single action.'
).format(policy.__class__.__name__))
def choose_action(self, curr_state):
actions_q_values = self.get_all_q_values_for_states(curr_state)
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
action = self.exploration_policy.get_action(actions_q_values)
self._validate_action(self.exploration_policy, action)
if actions_q_values is not None:
# this is for bootstrapped dqn
if type(actions_q_values) == list and len(actions_q_values) > 0:
actions_q_values = self.exploration_policy.last_action_values
actions_q_values = actions_q_values.squeeze()
# store the q values statistics for logging
self.q_values.add_sample(actions_q_values)
for i, q_value in enumerate(actions_q_values):
self.q_value_for_action[i].add_sample(q_value)
action_info = ActionInfo(action=action,
action_value=actions_q_values[action],
max_action_value=np.max(actions_q_values))
else:
action_info = ActionInfo(action=action)
return action_info
def learn_from_batch(self, batch):
raise NotImplementedError("ValueOptimizationAgent is an abstract agent. Not to be used directly.")

View File

@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@@ -0,0 +1,71 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
class Architecture(object):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= ""):
"""
:param agent_parameters: the agent parameters
:param spaces: the spaces (observation, action, etc.) definition of the agent
:param name: the name of the network
"""
# spaces
self.spaces = spaces
self.name = name
self.network_wrapper_name = self.name.split('/')[0] # the name can be main/online and the network_wrapper_name will be main
self.full_name = "{}/{}".format(agent_parameters.full_name_id, name)
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
self.batch_size = self.network_parameters.batch_size
self.learning_rate = self.network_parameters.learning_rate
self.optimizer = None
self.ap = agent_parameters
def get_model(self):
pass
def predict(self, inputs):
pass
def train_on_batch(self, inputs, targets):
pass
def get_weights(self):
pass
def set_weights(self, weights, rate=1.0):
pass
def reset_accumulated_gradients(self):
pass
def accumulate_gradients(self, inputs, targets):
pass
def apply_and_reset_gradients(self, gradients):
pass
def apply_gradients(self, gradients):
pass
def get_variable_value(self, variable):
pass
def set_variable_value(self, assign_op, value, placeholder=None):
pass
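# Illustrative sketch of the soft-update semantics behind set_weights(weights, rate) above:
# new_weight = rate * source_weight + (1 - rate) * current_weight, with rate=1.0 copying
# the source exactly. A minimal numpy version, assuming the weights are given as a list of arrays:
import numpy as np

def soft_update(current_weights, source_weights, rate=1.0):
    # rate=1.0 copies the source exactly; a small rate yields a slow-moving target network
    return [rate * src + (1.0 - rate) * cur
            for cur, src in zip(current_weights, source_weights)]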

View File

@@ -0,0 +1,210 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Tuple
from rl_coach.base_parameters import Frameworks, AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.logger import failed_imports
try:
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.general_network import GeneralTensorFlowNetwork
except ImportError:
failed_imports.append("TensorFlow")
class NetworkWrapper(object):
"""
Contains multiple networks and manages syncing and gradient updates
between them.
"""
def __init__(self, agent_parameters: AgentParameters, has_target: bool, has_global: bool, name: str,
spaces: SpacesDefinition, replicated_device=None, worker_device=None):
self.ap = agent_parameters
self.network_parameters = self.ap.network_wrappers[name]
self.has_target = has_target
self.has_global = has_global
self.name = name
self.sess = None
if self.network_parameters.framework == Frameworks.tensorflow:
general_network = GeneralTensorFlowNetwork
else:
raise Exception("{} Framework is not supported"
.format(Frameworks().to_string(self.network_parameters.framework)))
with tf.variable_scope("{}/{}".format(self.ap.full_name_id, name)):
# Global network - the main network shared between threads
self.global_network = None
if self.has_global:
# we assign the parameters of this network on the parameters server
with tf.device(replicated_device):
self.global_network = general_network(agent_parameters=agent_parameters,
name='{}/global'.format(name),
global_network=None,
network_is_local=False,
spaces=spaces,
network_is_trainable=True)
# Online network - local copy of the main network used for playing
self.online_network = None
with tf.device(worker_device):
self.online_network = general_network(agent_parameters=agent_parameters,
name='{}/online'.format(name),
global_network=self.global_network,
network_is_local=True,
spaces=spaces,
network_is_trainable=True)
# Target network - a local, slow updating network used for stabilizing the learning
self.target_network = None
if self.has_target:
with tf.device(worker_device):
self.target_network = general_network(agent_parameters=agent_parameters,
name='{}/target'.format(name),
global_network=self.global_network,
network_is_local=True,
spaces=spaces,
network_is_trainable=False)
def sync(self):
"""
Initializes the weights of the networks to match each other
:return:
"""
self.update_online_network()
self.update_target_network()
def update_target_network(self, rate=1.0):
"""
Copy weights: online network >>> target network
:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.target_network:
self.target_network.set_weights(self.online_network.get_weights(), rate)
def update_online_network(self, rate=1.0):
"""
Copy weights: global network >>> online network
:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.global_network:
self.online_network.set_weights(self.global_network.get_weights(), rate)
def apply_gradients_to_global_network(self, gradients=None):
"""
Apply gradients from the online network on the global network
:param gradients: optional gradients that will be used instead of the accumulated gradients
:return:
"""
if gradients is None:
gradients = self.online_network.accumulated_gradients
if self.network_parameters.shared_optimizer:
self.global_network.apply_gradients(gradients)
else:
self.online_network.apply_gradients(gradients)
def apply_gradients_to_online_network(self, gradients=None):
"""
Apply gradients from the online network on itself
:return:
"""
if gradients is None:
gradients = self.online_network.accumulated_gradients
self.online_network.apply_gradients(gradients)
def train_and_sync_networks(self, inputs, targets, additional_fetches=[], importance_weights=None):
"""
A generic training function that enables multi-threaded training using a global network if necessary.
:param inputs: The inputs for the network.
:param targets: The targets corresponding to the given inputs
:param additional_fetches: Any additional tensor the user wants to fetch
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the sample losses won't be scaled
:return: The loss of the training iteration
"""
result = self.online_network.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
importance_weights=importance_weights, no_accumulation=True)
self.apply_gradients_and_sync_networks(reset_gradients=False)
return result
def apply_gradients_and_sync_networks(self, reset_gradients=True):
"""
Applies the gradients accumulated in the online network to the global network or to itself and syncs the
networks if necessary
:param reset_gradients: If set to True, the accumulated gradients will be reset to 0 after applying them to
the network. Setting it to False is useful when the accumulated gradients are overwritten
(instead of accumulated) by the accumulate_gradients function, and reduces the runtime
of this function by around 10%
"""
if self.global_network:
self.apply_gradients_to_global_network()
if reset_gradients:
self.online_network.reset_accumulated_gradients()
self.update_online_network()
else:
if reset_gradients:
self.online_network.apply_and_reset_gradients(self.online_network.accumulated_gradients)
else:
self.online_network.apply_gradients(self.online_network.accumulated_gradients)
def parallel_prediction(self, network_input_tuples: List[Tuple]):
"""
Run several network predictions in parallel. Currently this only supports running each network once.
:param network_input_tuples: a list of tuples where the first element is the network (online_network,
target_network or global_network) and the second element is the inputs
:return: the outputs of all the networks in the same order as the inputs were given
"""
feed_dict = {}
fetches = []
for idx, (network, input) in enumerate(network_input_tuples):
feed_dict.update(network.create_feed_dict(input))
fetches += network.outputs
outputs = self.sess.run(fetches, feed_dict)
return outputs
def get_local_variables(self):
"""
Get all the variables that are local to the thread
:return: a list of all the variables that are local to the thread
"""
local_variables = [v for v in tf.local_variables() if self.online_network.name in v.name]
if self.has_target:
local_variables += [v for v in tf.local_variables() if self.target_network.name in v.name]
return local_variables
def get_global_variables(self):
"""
Get all the variables that are shared between threads
:return: a list of all the variables that are shared between threads
"""
global_variables = [v for v in tf.global_variables() if self.global_network.name in v.name]
return global_variables
def set_session(self, sess):
self.sess = sess
self.online_network.set_session(sess)
if self.global_network:
self.global_network.set_session(sess)
if self.target_network:
self.target_network.set_session(sess)
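# Illustrative sketch of the call order an agent typically drives on a NetworkWrapper
# (constructing agent_parameters, spaces and the session is omitted; the names are placeholders):
# network = NetworkWrapper(agent_parameters, has_target=True, has_global=False,
#                          name='main', spaces=spaces)
# network.set_session(sess)
# network.sync()                                   # align the online/target/global copies
# loss = network.train_and_sync_networks(inputs, targets)
# network.update_target_network(rate=1.0)          # copy the online weights into the target network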

View File

@@ -0,0 +1,664 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
from typing import List
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import force_list, squeeze_list
from rl_coach.architectures.architecture import Architecture
from rl_coach.core_types import GradientClippingMethod
def batchnorm_activation_dropout(input_layer, batchnorm, activation_function, dropout, dropout_rate, layer_idx):
layers = [input_layer]
# batchnorm
if batchnorm:
layers.append(
tf.layers.batch_normalization(layers[-1], name="batchnorm{}".format(layer_idx))
)
# activation
if activation_function:
layers.append(
activation_function(layers[-1], name="activation{}".format(layer_idx))
)
# dropout
if dropout:
layers.append(
tf.layers.dropout(layers[-1], dropout_rate, name="dropout{}".format(layer_idx))
)
# remove the input layer from the layers list
del layers[0]
return layers
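# Illustrative usage sketch (the tensors and arguments below are placeholders): appending the
# batchnorm -> activation -> dropout chain after some previously built layer:
# layers.extend(batchnorm_activation_dropout(layers[-1], batchnorm=True,
#                                            activation_function=tf.nn.relu,
#                                            dropout=False, dropout_rate=0.0, layer_idx=0))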
class Conv2d(object):
def __init__(self, params: List):
"""
:param params: list of [num_filters, kernel_size, strides]
"""
self.params = params
def __call__(self, input_layer, name: str):
"""
returns a tensorflow conv2d layer
:param input_layer: previous layer
:param name: layer name
:return: conv2d layer
"""
return tf.layers.conv2d(input_layer, filters=self.params[0], kernel_size=self.params[1], strides=self.params[2],
data_format='channels_last', name=name)
class Dense(object):
def __init__(self, params: List):
"""
:param params: list of [num_output_neurons]
"""
self.params = params
def __call__(self, input_layer, name: str):
"""
returns a tensorflow dense layer
:param input_layer: previous layer
:param name: layer name
:return: dense layer
"""
return tf.layers.dense(input_layer, self.params[0], name=name)
def variable_summaries(var):
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
with tf.name_scope('summaries'):
layer_weight_name = '_'.join(var.name.split('/')[-3:])[:-2]
with tf.name_scope(layer_weight_name):
mean = tf.reduce_mean(var)
tf.summary.scalar('mean', mean)
with tf.name_scope('stddev'):
stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
tf.summary.scalar('stddev', stddev)
tf.summary.scalar('max', tf.reduce_max(var))
tf.summary.scalar('min', tf.reduce_min(var))
tf.summary.histogram('histogram', var)
def local_getter(getter, name, *args, **kwargs):
"""
This is a wrapper around the tf.get_variable function which puts the variables in the local variables collection
instead of the global variables collection. The local variables collection will hold variables which are not shared
between workers. These variables are also assumed to be non-trainable (the optimizer does not apply gradients to
them), but we can still calculate gradients with respect to them and update their contents.
"""
kwargs['collections'] = [tf.GraphKeys.LOCAL_VARIABLES]
return getter(name, *args, **kwargs)
class TensorFlowArchitecture(Architecture):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= "",
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
"""
:param agent_parameters: the agent parameters
:param spaces: the spaces definition of the agent
:param name: the name of the network
:param global_network: the global network replica that is shared between all the workers
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
:param network_is_trainable: is the network trainable (we can apply gradients on it)
"""
super().__init__(agent_parameters, spaces, name)
self.middleware = None
self.network_is_local = network_is_local
self.global_network = global_network
if not self.network_parameters.tensorflow_support:
raise ValueError('TensorFlow is not supported for this agent')
self.sess = None
self.inputs = {}
self.outputs = []
self.targets = []
self.importance_weights = []
self.losses = []
self.total_loss = None
self.trainable_weights = []
self.weights_placeholders = []
self.shared_accumulated_gradients = []
self.curr_rnn_c_in = None
self.curr_rnn_h_in = None
self.gradients_wrt_inputs = []
self.train_writer = None
self.accumulated_gradients = None
self.network_is_trainable = network_is_trainable
self.is_chief = self.ap.task_parameters.task_index == 0
self.network_is_global = not self.network_is_local and global_network is None
self.distributed_training = self.network_is_global or self.network_is_local and global_network is not None
self.optimizer_type = self.network_parameters.optimizer_type
if self.ap.task_parameters.seed is not None:
tf.set_random_seed(self.ap.task_parameters.seed)
with tf.variable_scope("/".join(self.name.split("/")[1:]), initializer=tf.contrib.layers.xavier_initializer(),
custom_getter=local_getter if network_is_local and global_network else None):
self.global_step = tf.train.get_or_create_global_step()
# build the network
self.get_model()
# model weights
self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.full_name)
# create the placeholder for the assigning gradients and some tensorboard summaries for the weights
for idx, var in enumerate(self.weights):
placeholder = tf.placeholder(tf.float32, shape=var.get_shape(), name=str(idx) + '_holder')
self.weights_placeholders.append(placeholder)
if self.ap.visualization.tensorboard:
variable_summaries(var)
# create op for assigning a list of weights to the network weights
self.update_weights_from_list = [weights.assign(holder) for holder, weights in
zip(self.weights_placeholders, self.weights)]
# locks for synchronous training
if self.network_is_global:
self._create_locks_for_synchronous_training()
# gradients ops
self._create_gradient_ops()
# L2 regularization
if self.network_parameters.l2_regularization != 0:
self.l2_regularization = [tf.add_n([tf.nn.l2_loss(v) for v in self.weights])
* self.network_parameters.l2_regularization]
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.l2_regularization)
self.inc_step = self.global_step.assign_add(1)
# reset LSTM hidden cells
self.reset_internal_memory()
if self.ap.visualization.tensorboard:
current_scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
scope=tf.contrib.framework.get_name_scope())
self.merged = tf.summary.merge(current_scope_summaries)
# initialize or restore model
self.init_op = tf.group(
tf.global_variables_initializer(),
tf.local_variables_initializer()
)
# set the fetches for training
self._set_initial_fetch_list()
def _set_initial_fetch_list(self):
"""
Create an initial list of tensors to fetch in each training iteration
:return: None
"""
self.train_fetches = [self.gradients_norm]
if self.network_parameters.clip_gradients:
self.train_fetches.append(self.clipped_grads)
else:
self.train_fetches.append(self.tensor_gradients)
self.train_fetches += [self.total_loss, self.losses]
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
self.train_fetches.append(self.middleware.state_out)
self.additional_fetches_start_idx = len(self.train_fetches)
def _create_locks_for_synchronous_training(self):
"""
Create locks for synchronizing the different workers during training
:return: None
"""
self.lock_counter = tf.get_variable("lock_counter", [], tf.int32,
initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
self.lock = self.lock_counter.assign_add(1, use_locking=True)
self.lock_init = self.lock_counter.assign(0)
self.release_counter = tf.get_variable("release_counter", [], tf.int32,
initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
self.release = self.release_counter.assign_add(1, use_locking=True)
self.release_decrement = self.release_counter.assign_add(-1, use_locking=True)
self.release_init = self.release_counter.assign(0)
def _create_gradient_ops(self):
"""
Create all the tensorflow operations for calculating gradients, processing the gradients and applying them
:return: None
"""
self.tensor_gradients = tf.gradients(self.total_loss, self.weights)
self.gradients_norm = tf.global_norm(self.tensor_gradients)
# gradient clipping
if self.network_parameters.clip_gradients is not None and self.network_parameters.clip_gradients != 0:
self._create_gradient_clipping_ops()
# when using a shared optimizer, we create accumulators to store gradients from all the workers before
# applying them
if self.distributed_training:
self._create_gradient_accumulators()
# gradients of the outputs w.r.t. the inputs
# at the moment, this is only used by ddpg
self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in
self.inputs.items()} for output in self.outputs]
self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights')
for i in range(len(self.outputs))]
self.weighted_gradients = []
for i in range(len(self.outputs)):
unnormalized_gradients = tf.gradients(self.outputs[i], self.weights, self.gradients_weights_ph[i])
# unnormalized gradients seem to be better at this time. TODO: validate this across more environments
# self.weighted_gradients.append(list(map(lambda x: tf.div(x, self.network_parameters.batch_size),
# unnormalized_gradients)))
self.weighted_gradients.append(unnormalized_gradients)
# defining the optimization process (for LBFGS we have less control over the optimizer)
if self.optimizer_type != 'LBFGS' and self.network_is_trainable:
self._create_gradient_applying_ops()
def _create_gradient_accumulators(self):
if self.network_is_global:
self.shared_accumulated_gradients = [tf.Variable(initial_value=tf.zeros_like(var)) for var in self.weights]
self.accumulate_shared_gradients = [var.assign_add(holder, use_locking=True) for holder, var in
zip(self.weights_placeholders, self.shared_accumulated_gradients)]
self.init_shared_accumulated_gradients = [var.assign(tf.zeros_like(var)) for var in
self.shared_accumulated_gradients]
elif self.network_is_local:
self.accumulate_shared_gradients = self.global_network.accumulate_shared_gradients
self.init_shared_accumulated_gradients = self.global_network.init_shared_accumulated_gradients
def _create_gradient_clipping_ops(self):
"""
Create tensorflow ops for clipping the gradients according to the given GradientClippingMethod
:return: None
"""
if self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByGlobalNorm:
self.clipped_grads, self.grad_norms = tf.clip_by_global_norm(self.tensor_gradients,
self.network_parameters.clip_gradients)
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByValue:
self.clipped_grads = [tf.clip_by_value(grad,
-self.network_parameters.clip_gradients,
self.network_parameters.clip_gradients)
for grad in self.tensor_gradients]
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByNorm:
self.clipped_grads = [tf.clip_by_norm(grad, self.network_parameters.clip_gradients)
for grad in self.tensor_gradients]
def _create_gradient_applying_ops(self):
"""
Create tensorflow ops for applying the gradients to the network weights according to the training scheme
(distributed training - local or global network, shared optimizer, etc.)
:return: None
"""
if self.network_is_global and self.network_parameters.shared_optimizer and \
not self.network_parameters.async_training:
# synchronous training with shared optimizer? -> create an operation for applying the gradients
# accumulated in the shared gradients accumulator
self.update_weights_from_shared_gradients = self.optimizer.apply_gradients(
zip(self.shared_accumulated_gradients, self.weights),
global_step=self.global_step)
elif self.distributed_training and self.network_is_local:
# distributed training but independent optimizer? -> create an operation for applying the gradients
# to the global weights
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
zip(self.weights_placeholders, self.global_network.weights), global_step=self.global_step)
elif self.network_is_trainable:
# not any of the above but is trainable? -> create an operation for applying the gradients to
# this network weights
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
zip(self.weights_placeholders, self.weights), global_step=self.global_step)
def set_session(self, sess):
self.sess = sess
task_is_distributed = isinstance(self.ap.task_parameters, DistributedTaskParameters)
# initialize the session parameters in single threaded runs. Otherwise, this is done through the
# MonitoredSession object in the graph manager
if not task_is_distributed:
self.sess.run(self.init_op)
if self.ap.visualization.tensorboard:
# Write the merged summaries to the current experiment directory
if not task_is_distributed:
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path + '/tensorboard')
self.train_writer.add_graph(self.sess.graph)
elif self.network_is_local:
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path +
'/tensorboard/worker{}'.format(self.ap.task_parameters.task_index))
self.train_writer.add_graph(self.sess.graph)
# wait for all the workers to set their session
if not self.network_is_local:
self.wait_for_all_workers_barrier()
def reset_accumulated_gradients(self):
"""
Reset the gradients accumulation placeholder
"""
if self.accumulated_gradients is None:
self.accumulated_gradients = self.sess.run(self.weights)
for ix, grad in enumerate(self.accumulated_gradients):
self.accumulated_gradients[ix] = grad * 0
def accumulate_gradients(self, inputs, targets, additional_fetches=None, importance_weights=None,
no_accumulation=False):
"""
Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation
placeholders
:param additional_fetches: Optional tensors to fetch during gradients calculation
:param inputs: The input batch for the network
:param targets: The targets corresponding to the input batch
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the sample losses won't be scaled
:param no_accumulation: If set to True, the gradients in the accumulated gradients placeholder will be
replaced by the newly calculated gradients instead of being accumulated.
This can speed up the function runtime by around 10%.
:return: A list containing the total loss and the individual network heads losses
"""
if self.accumulated_gradients is None:
self.reset_accumulated_gradients()
# feed inputs
if additional_fetches is None:
additional_fetches = []
feed_dict = self.create_feed_dict(inputs)
# feed targets
targets = force_list(targets)
for placeholder_idx, target in enumerate(targets):
feed_dict[self.targets[placeholder_idx]] = target
# feed importance weights
importance_weights = force_list(importance_weights)
for placeholder_idx, target_ph in enumerate(targets):
if len(importance_weights) <= placeholder_idx or importance_weights[placeholder_idx] is None:
importance_weight = np.ones(target_ph.shape[0])
else:
importance_weight = importance_weights[placeholder_idx]
importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
feed_dict[self.importance_weights[placeholder_idx]] = importance_weight
if self.optimizer_type != 'LBFGS':
# feed the lstm state if necessary
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
# we can't always assume that we are starting from scratch here can we?
feed_dict[self.middleware.c_in] = self.middleware.c_init
feed_dict[self.middleware.h_in] = self.middleware.h_init
fetches = self.train_fetches + additional_fetches
if self.ap.visualization.tensorboard:
fetches += [self.merged]
# get grads
result = self.sess.run(fetches, feed_dict=feed_dict)
if hasattr(self, 'train_writer') and self.train_writer is not None:
self.train_writer.add_summary(result[-1], self.sess.run(self.global_step))
# extract the fetches
norm_unclipped_grads, grads, total_loss, losses = result[:4]
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
(self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
fetched_tensors = []
if len(additional_fetches) > 0:
fetched_tensors = result[self.additional_fetches_start_idx:self.additional_fetches_start_idx +
len(additional_fetches)]
# accumulate the gradients
for idx, grad in enumerate(grads):
if no_accumulation:
self.accumulated_gradients[idx] = grad
else:
self.accumulated_gradients[idx] += grad
return total_loss, losses, norm_unclipped_grads, fetched_tensors
else:
self.optimizer.minimize(session=self.sess, feed_dict=feed_dict)
return [0]
def create_feed_dict(self, inputs):
feed_dict = {}
for input_name, input_value in inputs.items():
if isinstance(input_name, str):
if input_name not in self.inputs:
raise ValueError((
'input name {input_name} was provided to create a feed '
'dictionary, but there is no placeholder with that name. '
'placeholder names available include: {placeholder_names}'
).format(
input_name=input_name,
placeholder_names=', '.join(self.inputs.keys())
))
feed_dict[self.inputs[input_name]] = input_value
elif isinstance(input_name, tf.Tensor) and input_name.op.type == 'Placeholder':
feed_dict[input_name] = input_value
else:
raise ValueError((
'input dictionary expects strings or placeholders as keys, '
'but found key {key} of type {type}'
).format(
key=input_name,
type=type(input_name),
))
return feed_dict
def apply_and_reset_gradients(self, gradients, scaler=1.):
"""
Applies the given gradients to the network weights and resets the accumulation placeholder
:param gradients: The gradients to use for the update
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
self.apply_gradients(gradients, scaler)
self.reset_accumulated_gradients()
def wait_for_all_workers_to_lock(self, lock: str, include_only_training_workers: bool=False):
"""
Waits for all the workers to lock a certain lock and then continues
:param lock: the name of the lock to use
:param include_only_training_workers: wait only for training workers or for all the workers?
:return: None
"""
if include_only_training_workers:
num_workers_to_wait_for = self.ap.task_parameters.num_training_tasks
else:
num_workers_to_wait_for = self.ap.task_parameters.num_tasks
# lock
if hasattr(self, '{}_counter'.format(lock)):
self.sess.run(getattr(self, lock))
while self.sess.run(getattr(self, '{}_counter'.format(lock))) % num_workers_to_wait_for != 0:
time.sleep(0.00001)
# self.sess.run(getattr(self, '{}_init'.format(lock)))
else:
raise ValueError("no counter was defined for the lock {}".format(lock))
def wait_for_all_workers_barrier(self, include_only_training_workers: bool=False):
"""
A barrier that allows waiting for all the workers to finish a certain block of commands
:param include_only_training_workers: wait only for training workers or for all the workers?
:return: None
"""
self.wait_for_all_workers_to_lock('lock', include_only_training_workers=include_only_training_workers)
self.sess.run(self.lock_init)
# we need to lock again (on a different lock) in order to prevent a situation where one of the workers continues
# and manages to increase the lock again by one, only to have a late worker reset it again.
# so we want to make sure that all workers are done resetting the lock before continuing to reuse that lock.
self.wait_for_all_workers_to_lock('release', include_only_training_workers=include_only_training_workers)
self.sess.run(self.release_init)
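# Illustrative note (hedged): in synchronous distributed training each worker accumulates its
# gradients, adds them to the shared accumulators, and then calls
# wait_for_all_workers_barrier(include_only_training_workers=True) so that the chief applies
# the shared gradients exactly once per step before any worker moves on (see apply_gradients below).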
def apply_gradients(self, gradients, scaler=1.):
"""
Applies the given gradients to the network weights
:param gradients: The gradients to use for the update
:param scaler: A scaling factor that allows rescaling the gradients before applying them.
The gradients will be MULTIPLIED by this factor
"""
if self.network_parameters.async_training or not isinstance(self.ap.task_parameters, DistributedTaskParameters):
if hasattr(self, 'global_step') and not self.network_is_local:
self.sess.run(self.inc_step)
if self.optimizer_type != 'LBFGS':
if self.distributed_training and not self.network_parameters.async_training:
# rescale the gradients so that they average out with the gradients from the other workers
if self.network_parameters.scale_down_gradients_by_number_of_workers_for_sync_training:
scaler /= float(self.ap.task_parameters.num_training_tasks)
# rescale the gradients
if scaler != 1.:
for gradient in gradients:
gradient *= scaler
# apply the gradients
feed_dict = dict(zip(self.weights_placeholders, gradients))
if self.distributed_training and self.network_parameters.shared_optimizer \
and not self.network_parameters.async_training:
# synchronous distributed training with shared optimizer:
# - each worker adds its gradients to the shared gradients accumulators
# - we wait for all the workers to add their gradients
# - the chief worker (worker with task index = 0) applies the gradients once and resets the accumulators
self.sess.run(self.accumulate_shared_gradients, feed_dict=feed_dict)
self.wait_for_all_workers_barrier(include_only_training_workers=True)
if self.is_chief:
self.sess.run(self.update_weights_from_shared_gradients)
self.sess.run(self.init_shared_accumulated_gradients)
else:
# async distributed training / distributed training with independent optimizer
# / non-distributed training - just apply the gradients
feed_dict = dict(zip(self.weights_placeholders, gradients))
self.sess.run(self.update_weights_from_batch_gradients, feed_dict=feed_dict)
# release barrier
if self.distributed_training and not self.network_parameters.async_training:
self.wait_for_all_workers_barrier(include_only_training_workers=True)
def predict(self, inputs, outputs=None, squeeze_output=True, initial_feed_dict=None):
"""
Run a forward pass of the network using the given input
:param inputs: The input for the network
:param outputs: The output for the network, defaults to self.outputs
:param squeeze_output: call squeeze_list on output
:param initial_feed_dict: a dictionary to use as the initial feed_dict. other inputs will be added to this dict
:return: The network output
WARNING: must only be called once per state, since each call is assumed by the LSTM to be a new time step.
"""
feed_dict = self.create_feed_dict(inputs)
if initial_feed_dict:
feed_dict.update(initial_feed_dict)
if outputs is None:
outputs = self.outputs
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
feed_dict[self.middleware.c_in] = self.curr_rnn_c_in
feed_dict[self.middleware.h_in] = self.curr_rnn_h_in
output, (self.curr_rnn_c_in, self.curr_rnn_h_in) = self.sess.run([outputs, self.middleware.state_out],
feed_dict=feed_dict)
else:
output = self.sess.run(outputs, feed_dict)
if squeeze_output:
output = squeeze_list(output)
return output
def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None, importance_weights=None):
"""
Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
:param additional_fetches: Optional tensors to fetch during the training process
:param inputs: The input for the network
:param targets: The targets corresponding to the input batch
:param scaler: A scaling factor that allows rescaling the gradients before applying them
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the sample losses won't be scaled
:return: The loss of the network
"""
if additional_fetches is None:
additional_fetches = []
additional_fetches = force_list(additional_fetches)
loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
importance_weights=importance_weights)
self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
return loss
def get_weights(self):
"""
:return: a list of tensors containing the network weights for each layer
"""
return self.weights
def set_weights(self, weights, new_rate=1.0):
"""
Sets the network weights from the given list of weight tensors
"""
feed_dict = {}
old_weights, new_weights = self.sess.run([self.get_weights(), weights])
for placeholder_idx, new_weight in enumerate(new_weights):
feed_dict[self.weights_placeholders[placeholder_idx]]\
= new_rate * new_weight + (1 - new_rate) * old_weights[placeholder_idx]
self.sess.run(self.update_weights_from_list, feed_dict)
def get_variable_value(self, variable):
"""
Get the value of a variable from the graph
:param variable: the variable
:return: the value of the variable
"""
return self.sess.run(variable)
def set_variable_value(self, assign_op, value, placeholder=None):
"""
Updates the value of a variable.
This requires having an assign operation for the variable, and a placeholder which will provide the value
:param assign_op: an assign operation for the variable
:param value: a value to set the variable to
:param placeholder: a placeholder to hold the given value for injecting it into the variable
"""
self.sess.run(assign_op, feed_dict={placeholder: value})
def reset_internal_memory(self):
"""
Reset any internal memory used by the network. For example, an LSTM internal state
:return: None
"""
# initialize LSTM hidden states
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
self.curr_rnn_c_in = self.middleware.c_init
self.curr_rnn_h_in = self.middleware.h_init
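# Illustrative sketch of the gradient cycle implemented by this class (object construction is
# omitted and the inputs/targets below are placeholders):
# total_loss, losses, grad_norm, fetched = net.accumulate_gradients(inputs, targets)
# net.apply_and_reset_gradients(net.accumulated_gradients)   # apply the buffer, then zero it
# or, equivalently, in a single call:
# loss = net.train_on_batch(inputs, targets)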

View File

@@ -0,0 +1,102 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Tuple
import tensorflow as tf
def create_cluster_spec(parameters_server: str, workers: str) -> tf.train.ClusterSpec:
"""
Creates a ClusterSpec object representing the cluster.
:param parameters_server: comma-separated list of hostname:port pairs to which the parameter servers are assigned
:param workers: comma-separated list of hostname:port pairs to which the workers are assigned
:return: a ClusterSpec object representing the cluster
"""
# extract the parameter servers and workers from the given strings
ps_hosts = parameters_server.split(",")
worker_hosts = workers.split(",")
# Create a cluster spec from the parameter server and worker hosts
cluster_spec = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
return cluster_spec
def create_and_start_parameters_server(cluster_spec: tf.train.ClusterSpec, config: tf.ConfigProto=None) -> None:
"""
Create and start a parameter server
:param cluster_spec: the ClusterSpec object representing the cluster
:param config: the tensorflow config to use
:return: None
"""
# create a server object for the parameter server
server = tf.train.Server(cluster_spec, job_name="ps", task_index=0, config=config)
# wait for the server to finish
server.join()
def create_worker_server_and_device(cluster_spec: tf.train.ClusterSpec, task_index: int,
use_cpu: bool=True, config: tf.ConfigProto=None) -> Tuple[str, tf.device]:
"""
Creates a worker server and a device setter used to assign the worker's operations to
:param cluster_spec: a ClusterSpec object representing the cluster
:param task_index: the index of the worker task
:param use_cpu: if use_cpu=True, all the agent operations will be assigned to a CPU instead of a GPU
:param config: the tensorflow config to use
:return: the target string for the tf.Session and the worker device setter object
"""
# Create and start a worker
server = tf.train.Server(cluster_spec, job_name="worker", task_index=task_index, config=config)
# Assign ops to the local worker
worker_device = "/job:worker/task:{}".format(task_index)
if use_cpu:
worker_device += "/cpu:0"
else:
worker_device += "/device:GPU:0"
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster_spec)
return server.target, device
def create_monitored_session(target: tf.train.Server, task_index: int,
checkpoint_dir: str, save_checkpoint_secs: int, config: tf.ConfigProto=None) -> tf.Session:
"""
Create a monitored session for the worker
:param target: the target string for the tf.Session
:param task_index: the task index of the worker
:param checkpoint_dir: a directory path where the checkpoints will be stored
:param save_checkpoint_secs: the number of seconds between storing checkpoints
:param config: the tensorflow configuration (optional)
:return: the session to use for the run
"""
# we chose the first task to be the chief
is_chief = task_index == 0
# Create the monitored session
sess = tf.train.MonitoredTrainingSession(
master=target,
is_chief=is_chief,
hooks=[],
checkpoint_dir=checkpoint_dir,
save_checkpoint_secs=save_checkpoint_secs,
config=config
)
return sess
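# Illustrative sketch of wiring the helpers above together for a single worker
# (the host:port lists and the checkpoint directory are placeholders):
# cluster = create_cluster_spec("localhost:2222", "localhost:2223,localhost:2224")
# target, device = create_worker_server_and_device(cluster, task_index=0, use_cpu=True)
# sess = create_monitored_session(target, task_index=0, checkpoint_dir="./checkpoints",
#                                 save_checkpoint_secs=600)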

View File

@@ -0,0 +1,114 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Union
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.shared_variables import SharedRunningStats
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.core_types import InputEmbedding
class InputEmbedder(object):
"""
An input embedder is the first part of the network, which takes the input from the state and produces a vector
embedding by passing it through a neural network. The embedder will mostly be input type dependent, and there
can be multiple embedders in a single network
"""
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=None, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling=1.0, input_offset=0.0, input_clipping=None):
self.name = name
self.input_size = input_size
self.activation_function = activation_function
self.batchnorm = batchnorm
self.dropout = dropout
self.dropout_rate = 0
self.input = None
self.output = None
self.scheme = scheme
self.return_type = InputEmbedding
self.layers = []
self.input_rescaling = input_rescaling
self.input_offset = input_offset
self.input_clipping = input_clipping
def __call__(self, prev_input_placeholder=None):
with tf.variable_scope(self.get_name()):
if prev_input_placeholder is None:
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
else:
self.input = prev_input_placeholder
self._build_module()
return self.input, self.output
def _build_module(self):
# NOTE: for image inputs, we expect the data format to be of type uint8, in order to be memory efficient. We chose
# not to implement the rescaling as an input filters.observation.observation_filter, as this would have caused the
# input to the network to be float, which is 4x more expensive in memory,
# thus also making each saved transition in the memory 4x more expensive.
input_layer = self.input / self.input_rescaling
input_layer -= self.input_offset
# clip the input using the given range
if self.input_clipping is not None:
input_layer = tf.clip_by_value(input_layer, self.input_clipping[0], self.input_clipping[1])
self.layers.append(input_layer)
# layers order is conv -> batchnorm -> activation -> dropout
if isinstance(self.scheme, EmbedderScheme):
layers_params = self.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
self.output = tf.contrib.layers.flatten(self.layers[-1])
@property
def input_size(self) -> List[int]:
return self._input_size
@input_size.setter
def input_size(self, value: Union[int, List[int]]):
if isinstance(value, np.ndarray) or isinstance(value, tuple):
value = list(value)
elif isinstance(value, int):
value = [value]
if not isinstance(value, list):
raise ValueError((
'input_size expected to be a list, found {value} which has type {type}'
).format(value=value, type=type(value)))
self._input_size = value
@property
def schemes(self):
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
"configurations.")
def get_name(self):
return self.name
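# Minimal numpy sketch of the preprocessing order applied in _build_module above:
# rescale, then subtract the offset, then (optionally) clip.
import numpy as np

def illustrative_preprocess(raw_input, rescaling=1.0, offset=0.0, clipping=None):
    x = raw_input / rescaling - offset
    if clipping is not None:
        x = np.clip(x, clipping[0], clipping[1])
    return x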

View File

@@ -0,0 +1,74 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import Conv2d
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
from rl_coach.core_types import InputImageEmbedding
class ImageEmbedder(InputEmbedder):
"""
An input embedder that performs convolutions on the input and then flattens the result.
The embedder is intended for image like inputs, where the channels are expected to be the last axis.
The embedder also allows custom rescaling of the input prior to the neural network.
"""
schemes = {
EmbedderScheme.Empty:
[],
EmbedderScheme.Shallow:
[
Conv2d([32, 3, 1])
],
# atari dqn
EmbedderScheme.Medium:
[
Conv2d([32, 8, 4]),
Conv2d([64, 4, 2]),
Conv2d([64, 3, 1])
],
# carla
EmbedderScheme.Deep: \
[
Conv2d([32, 5, 2]),
Conv2d([32, 3, 1]),
Conv2d([64, 3, 2]),
Conv2d([64, 3, 1]),
Conv2d([128, 3, 2]),
Conv2d([128, 3, 1]),
Conv2d([256, 3, 2]),
Conv2d([256, 3, 1])
]
}
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling: float=255.0, input_offset: float=0.0, input_clipping=None):
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name, input_rescaling,
input_offset, input_clipping)
self.return_type = InputImageEmbedding
if len(input_size) != 3 and scheme != EmbedderScheme.Empty:
raise ValueError("Image embedders expect the input size to have 3 dimensions. The given size is: {}"
.format(input_size))

View File

@@ -0,0 +1,64 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
from rl_coach.core_types import InputVectorEmbedding
class VectorEmbedder(InputEmbedder):
"""
An input embedder that is intended for inputs that can be represented as vectors.
The embedder flattens the input, applies several dense layers to it and returns the output.
"""
schemes = {
EmbedderScheme.Empty:
[],
EmbedderScheme.Shallow:
[
Dense([128])
],
# dqn
EmbedderScheme.Medium:
[
Dense([256])
],
# carla
EmbedderScheme.Deep: \
[
Dense([128]),
Dense([128]),
Dense([128])
]
}
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling: float=1.0, input_offset:float=0.0, input_clipping=None):
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name,
input_rescaling, input_offset, input_clipping)
self.return_type = InputVectorEmbedding
if len(self.input_size) != 1 and scheme != EmbedderScheme.Empty:
raise ValueError("The input size of a vector embedder must contain only a single dimension")
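# Illustrative instantiations (the shapes below are placeholders, not Coach defaults):
# image_embedder = ImageEmbedder([84, 84, 4], scheme=EmbedderScheme.Medium)   # stacked image frames
# vector_embedder = VectorEmbedder([17], scheme=EmbedderScheme.Medium)        # a low-dimensional state vector
# input_placeholder, embedding = vector_embedder()                            # builds the placeholder and layers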

View File

@@ -0,0 +1,344 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from typing import Dict
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.middleware import MiddlewareParameters
from rl_coach.base_parameters import AgentParameters, InputEmbedderParameters, EmbeddingMergerType
from rl_coach.spaces import SpacesDefinition, PlanarMapsObservationSpace
from rl_coach.utils import get_all_subclasses, dynamic_import_and_instantiate_module_from_params
from rl_coach.architectures.tensorflow_components.architecture import TensorFlowArchitecture
from rl_coach.core_types import PredictionType
class GeneralTensorFlowNetwork(TensorFlowArchitecture):
"""
A generalized version of all possible networks implemented using tensorflow.
"""
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str,
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
"""
:param agent_parameters: the agent parameters
:param spaces: the spaces definition of the agent
:param name: the name of the network
:param global_network: the global network replica that is shared between all the workers
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
:param network_is_trainable: is the network trainable (we can apply gradients on it)
"""
self.global_network = global_network
self.network_is_local = network_is_local
self.network_wrapper_name = name.split('/')[0]
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
self.num_heads_per_network = 1 if self.network_parameters.use_separate_networks_per_head else \
len(self.network_parameters.heads_parameters)
self.num_networks = 1 if not self.network_parameters.use_separate_networks_per_head else \
len(self.network_parameters.heads_parameters)
self.gradients_from_head_rescalers = []
self.gradients_from_head_rescalers_placeholders = []
self.update_head_rescaler_value_ops = []
self.adaptive_learning_rate_scheme = None
self.current_learning_rate = None
# init network modules containers
self.input_embedders = []
self.output_heads = []
super().__init__(agent_parameters, spaces, name, global_network,
network_is_local, network_is_trainable)
def fill_return_types():
ret_dict = {}
for cls in get_all_subclasses(PredictionType):
ret_dict[cls] = []
components = self.input_embedders + [self.middleware] + self.output_heads
for component in components:
if not hasattr(component, 'return_type'):
raise ValueError("{} has no return_type attribute. This should not happen.".format(component))
if component.return_type is not None:
ret_dict[component.return_type].append(component)
return ret_dict
self.available_return_types = fill_return_types()
def predict_with_prediction_type(self, states: Dict[str, np.ndarray],
prediction_type: PredictionType) -> Dict[str, np.ndarray]:
"""
Search for the component(s) whose return_type is set to the requested PredictionType, and get
predictions for them.
:param states: The input states to the network.
:param prediction_type: The requested PredictionType to look for in the network components
:return: A dictionary with predictions for all components matching the requested prediction type
"""
ret_dict = {}
for component in self.available_return_types[prediction_type]:
ret_dict[component] = self.predict(inputs=states, outputs=component.output)
return ret_dict
@staticmethod
def get_activation_function(activation_function_string: str):
"""
Map the activation function from a string to the tensorflow framework equivalent
:param activation_function_string: the type of the activation function
:return: the tensorflow activation function
"""
activation_functions = {
'relu': tf.nn.relu,
'tanh': tf.nn.tanh,
'sigmoid': tf.nn.sigmoid,
'elu': tf.nn.elu,
'selu': tf.nn.selu,
'leaky_relu': tf.nn.leaky_relu,
'none': None
}
assert activation_function_string in activation_functions.keys(), \
"Activation function must be one of the following {}. Instead it was: {}"\
.format(activation_functions.keys(), activation_function_string)
return activation_functions[activation_function_string]
def get_input_embedder(self, input_name: str, embedder_params: InputEmbedderParameters):
"""
Given an input embedder parameters class, creates the input embedder and returns it
:param input_name: the name of the input to the embedder (used for retrieving the shape). The input should
be a value within the state or the action.
:param embedder_params: the parameters of the class of the embedder
:return: the embedder instance
"""
allowed_inputs = copy.copy(self.spaces.state.sub_spaces)
allowed_inputs["action"] = copy.copy(self.spaces.action)
allowed_inputs["goal"] = copy.copy(self.spaces.goal)
if input_name not in allowed_inputs.keys():
raise ValueError("The key for the input embedder ({}) must match one of the following keys: {}"
.format(input_name, allowed_inputs.keys()))
type = "vector"
if isinstance(allowed_inputs[input_name], PlanarMapsObservationSpace):
type = "image"
embedder_path = 'rl_coach.architectures.tensorflow_components.embedders.' + embedder_params.path[type]
embedder_params_copy = copy.copy(embedder_params)
embedder_params_copy.activation_function = self.get_activation_function(embedder_params.activation_function)
embedder_params_copy.input_rescaling = embedder_params_copy.input_rescaling[type]
embedder_params_copy.input_offset = embedder_params_copy.input_offset[type]
embedder_params_copy.name = input_name
module = dynamic_import_and_instantiate_module_from_params(embedder_params_copy,
path=embedder_path,
positional_args=[allowed_inputs[input_name].shape])
return module
def get_middleware(self, middleware_params: MiddlewareParameters):
"""
Given a middleware type, creates the middleware and returns it
:param middleware_params: the parameters of the middleware class
:return: the middleware instance
"""
middleware_params_copy = copy.copy(middleware_params)
middleware_params_copy.activation_function = self.get_activation_function(middleware_params.activation_function)
module = dynamic_import_and_instantiate_module_from_params(middleware_params_copy)
return module
def get_output_head(self, head_params: HeadParameters, head_idx: int, loss_weight: float=1.):
"""
Given a head type, creates the head and returns it
:param head_params: the parameters of the head to create. The head class is given by its path under the heads
directory, or by a full path in the following structure: <module_path>:<class_path>
:param head_idx: the head index
:param loss_weight: the weight to assign to the head's loss
:return: the head
"""
head_params_copy = copy.copy(head_params)
head_params_copy.activation_function = self.get_activation_function(head_params_copy.activation_function)
return dynamic_import_and_instantiate_module_from_params(head_params_copy, extra_kwargs={
'agent_parameters': self.ap, 'spaces': self.spaces, 'network_name': self.network_wrapper_name,
'head_idx': head_idx, 'loss_weight': loss_weight, 'is_local': self.network_is_local})
def get_model(self):
# validate the configuration
if len(self.network_parameters.input_embedders_parameters) == 0:
raise ValueError("At least one input type should be defined")
if len(self.network_parameters.heads_parameters) == 0:
raise ValueError("At least one output type should be defined")
if self.network_parameters.middleware_parameters is None:
raise ValueError("Exactly one middleware type should be defined")
if len(self.network_parameters.loss_weights) == 0:
raise ValueError("At least one loss weight should be defined")
if len(self.network_parameters.heads_parameters) != len(self.network_parameters.loss_weights):
raise ValueError("Number of loss weights should match the number of output types")
for network_idx in range(self.num_networks):
with tf.variable_scope('network_{}'.format(network_idx)):
####################
# Input Embeddings #
####################
state_embedding = []
for input_name in sorted(self.network_parameters.input_embedders_parameters):
input_type = self.network_parameters.input_embedders_parameters[input_name]
# get the class of the input embedder
input_embedder = self.get_input_embedder(input_name, input_type)
self.input_embedders.append(input_embedder)
# input placeholders are reused between networks. on the first network, store the placeholders
# generated by the input_embedders in self.inputs. on the rest of the networks, pass
# the existing input_placeholders into the input_embedders.
if network_idx == 0:
input_placeholder, embedding = input_embedder()
self.inputs[input_name] = input_placeholder
else:
input_placeholder, embedding = input_embedder(self.inputs[input_name])
state_embedding.append(embedding)
##########
# Merger #
##########
if len(state_embedding) == 1:
state_embedding = state_embedding[0]
else:
if self.network_parameters.embedding_merger_type == EmbeddingMergerType.Concat:
state_embedding = tf.concat(state_embedding, axis=-1, name="merger")
elif self.network_parameters.embedding_merger_type == EmbeddingMergerType.Sum:
state_embedding = tf.add_n(state_embedding, name="merger")
##############
# Middleware #
##############
self.middleware = self.get_middleware(self.network_parameters.middleware_parameters)
_, self.state_embedding = self.middleware(state_embedding)
################
# Output Heads #
################
head_count = 0
for head_idx in range(self.num_heads_per_network):
for head_copy_idx in range(self.network_parameters.num_output_head_copies):
if self.network_parameters.use_separate_networks_per_head:
# if we use separate networks per head, then the head type corresponds to the network idx
head_type_idx = network_idx
head_count = network_idx
else:
# if we use a single network with multiple heads, then the head type is the current head idx
head_type_idx = head_idx
self.output_heads.append(
self.get_output_head(self.network_parameters.heads_parameters[head_type_idx],
head_copy_idx,
self.network_parameters.loss_weights[head_type_idx])
)
# rescale the gradients from the head
self.gradients_from_head_rescalers.append(
tf.get_variable('gradients_from_head_{}-{}_rescalers'.format(head_idx, head_copy_idx),
initializer=float(
self.network_parameters.rescale_gradient_from_head_by_factor[head_count]
),
dtype=tf.float32))
self.gradients_from_head_rescalers_placeholders.append(
tf.placeholder('float',
name='gradients_from_head_{}-{}_rescalers'.format(head_type_idx, head_copy_idx)))
self.update_head_rescaler_value_ops.append(self.gradients_from_head_rescalers[head_count].assign(
self.gradients_from_head_rescalers_placeholders[head_count]))
head_input = (1-self.gradients_from_head_rescalers[head_count]) * tf.stop_gradient(self.state_embedding) + \
self.gradients_from_head_rescalers[head_count] * self.state_embedding
# build the head
if self.network_is_local:
output, target_placeholder, input_placeholders, importance_weight_ph = \
self.output_heads[-1](head_input)
self.targets.extend(target_placeholder)
self.importance_weights.extend(importance_weight_ph)
else:
output, input_placeholders = self.output_heads[-1](head_input)
self.outputs.extend(output)
# TODO: use head names as well
for placeholder_index, input_placeholder in enumerate(input_placeholders):
self.inputs['output_{}_{}'.format(head_type_idx, placeholder_index)] = input_placeholder
head_count += 1
# Losses
self.losses = tf.losses.get_losses(self.full_name)
self.losses += tf.losses.get_regularization_losses(self.full_name)
self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.full_name)
# tf.summary.scalar('total_loss', self.total_loss)
# Learning rate
if self.network_parameters.learning_rate_decay_rate != 0:
self.adaptive_learning_rate_scheme = \
tf.train.exponential_decay(
self.network_parameters.learning_rate,
self.global_step,
decay_steps=self.network_parameters.learning_rate_decay_steps,
decay_rate=self.network_parameters.learning_rate_decay_rate,
staircase=True)
self.current_learning_rate = self.adaptive_learning_rate_scheme
else:
self.current_learning_rate = self.network_parameters.learning_rate
# Optimizer
if self.distributed_training and self.network_is_local and self.network_parameters.shared_optimizer:
# distributed training + is a local network + optimizer shared -> take the global optimizer
self.optimizer = self.global_network.optimizer
elif (self.distributed_training and self.network_is_local and not self.network_parameters.shared_optimizer) \
or self.network_parameters.shared_optimizer or not self.distributed_training:
# distributed training + is a global network + optimizer shared
# OR
# distributed training + is a local network + optimizer not shared
# OR
# non-distributed training
# -> create an optimizer
if self.network_parameters.optimizer_type == 'Adam':
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.current_learning_rate,
beta1=self.network_parameters.adam_optimizer_beta1,
beta2=self.network_parameters.adam_optimizer_beta2,
epsilon=self.network_parameters.optimizer_epsilon)
elif self.network_parameters.optimizer_type == 'RMSProp':
self.optimizer = tf.train.RMSPropOptimizer(self.current_learning_rate,
decay=self.network_parameters.rms_prop_optimizer_decay,
epsilon=self.network_parameters.optimizer_epsilon)
elif self.network_parameters.optimizer_type == 'LBFGS':
self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B',
options={'maxiter': 25})
else:
raise Exception("{} is not a valid optimizer type".format(self.network_parameters.optimizer_type))
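The head_input expression above (a convex combination of the embedding and its stop_gradient copy) is a gradient-rescaling trick: the forward value is always the embedding itself, while the gradient flowing from the head back into the middleware is multiplied by the rescaler. A minimal NumPy sketch of the idea, independent of the TensorFlow code above (names and numbers are illustrative):

import numpy as np

def rescaled_head_input(embedding, rescaler):
    # emulate stop_gradient by treating this copy as a constant w.r.t. the embedding
    detached = embedding.copy()
    return (1.0 - rescaler) * detached + rescaler * embedding

embedding = np.array([0.5, -1.0, 2.0])
for k in (0.0, 0.5, 1.0):
    out = rescaled_head_input(embedding, k)
    # forward pass: identical to the embedding for any rescaler value
    assert np.allclose(out, embedding)
    # backward pass: d(out)/d(embedding) = k, since the detached copy contributes no gradient,
    # so a rescaler of 0 blocks the head's gradient entirely and 1 passes it through unchanged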

View File

@@ -0,0 +1,54 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class CategoricalQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params'):
super().__init__(parameterized_class=CategoricalQHead, activation_function=activation_function, name=name)
class CategoricalQHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'categorical_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms
self.return_type = QActionStateValue
def _build_module(self, input_layer):
self.actions = tf.placeholder(tf.int32, [None], name="actions")
self.input = [self.actions]
values_distribution = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
self.num_atoms))
# softmax on atoms dimension
self.output = tf.nn.softmax(values_distribution)
# calculate cross entropy loss
self.distributions = tf.placeholder(tf.float32, shape=(None, self.num_actions, self.num_atoms),
name="distributions")
self.target = self.distributions
self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=values_distribution)
tf.losses.add_loss(self.loss)
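For context, a categorical (C51) head such as the one above outputs, per action, a softmax distribution over a fixed support of atoms, and the agent recovers Q-values by taking the expectation over that support. A short NumPy sketch of that step; the support bounds and atom count below are illustrative and not taken from this file:

import numpy as np

num_actions, num_atoms = 3, 11
v_min, v_max = -10.0, 10.0                      # assumed support bounds
support = np.linspace(v_min, v_max, num_atoms)  # z_1 ... z_N

logits = np.random.randn(num_actions, num_atoms)
# softmax over the atoms dimension, as in the head above
probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)

# Q(s, a) = sum_i z_i * p_i(s, a), the expected value under each action's distribution
q_values = (probs * support).sum(axis=-1)
greedy_action = int(np.argmax(q_values))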

View File

@@ -0,0 +1,66 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.core_types import ActionProbabilities
class DDPGActorHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params', batchnorm: bool=True):
super().__init__(parameterized_class=DDPGActor, activation_function=activation_function, name=name)
self.batchnorm = batchnorm
class DDPGActor(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh',
batchnorm: bool=True):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ddpg_actor_head'
self.return_type = ActionProbabilities
self.num_actions = self.spaces.action.shape
self.batchnorm = batchnorm
# bounded actions
self.output_scale = self.spaces.action.max_abs_range
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
self.action_penalty = None
if hasattr(agent_parameters.algorithm, 'action_penalty'):
self.action_penalty = agent_parameters.algorithm.action_penalty
def _build_module(self, input_layer):
# mean
pre_activation_policy_values_mean = tf.layers.dense(input_layer, self.num_actions, name='fc_mean')
policy_values_mean = batchnorm_activation_dropout(pre_activation_policy_values_mean, self.batchnorm,
self.activation_function,
False, 0, 0)[-1]
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
if self.is_local:
# add a penalty on the squared pre-activation values of the action
if self.action_penalty and self.action_penalty != 0:
self.regularizations += \
[self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
self.output = [self.policy_mean]
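For reference, bounding the actor output works by squashing the pre-activation values (tanh by default here) and multiplying by the half-range of the action space, so the resulting mean always lies inside the action bounds. A tiny NumPy illustration with an arbitrary range:

import numpy as np

max_abs_range = np.array([2.0, 0.5])      # per-dimension action bound (illustrative)
pre_activation = np.array([3.0, -10.0])   # unbounded network output

action_mean = np.tanh(pre_activation) * max_abs_range
# every dimension is guaranteed to lie within [-max_abs_range, max_abs_range]
assert np.all(np.abs(action_mean) <= max_abs_range)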

View File

@@ -0,0 +1,87 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
from rl_coach.spaces import SpacesDefinition
from rl_coach.memories.non_episodic import differentiable_neural_dictionary
class DNDQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='dnd_q_head_params'):
super().__init__(parameterized_class=DNDQHead, activation_function=activation_function, name=name)
class DNDQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'dnd_q_values_head'
self.DND_size = agent_parameters.algorithm.dnd_size
self.DND_key_error_threshold = agent_parameters.algorithm.DND_key_error_threshold
self.l2_norm_added_delta = agent_parameters.algorithm.l2_norm_added_delta
self.new_value_shift_coefficient = agent_parameters.algorithm.new_value_shift_coefficient
self.number_of_nn = agent_parameters.algorithm.number_of_knn
self.ap = agent_parameters
self.dnd_embeddings = [None] * self.num_actions
self.dnd_values = [None] * self.num_actions
self.dnd_indices = [None] * self.num_actions
self.dnd_distances = [None] * self.num_actions
if self.ap.memory.shared_memory:
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
def _build_module(self, input_layer):
if hasattr(self.ap.task_parameters, 'checkpoint_restore_dir') and self.ap.task_parameters.checkpoint_restore_dir:
self.DND = differentiable_neural_dictionary.load_dnd(self.ap.task_parameters.checkpoint_restore_dir)
else:
self.DND = differentiable_neural_dictionary.QDND(
self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient,
key_error_threshold=self.DND_key_error_threshold,
learning_rate=self.network_parameters.learning_rate,
num_neighbors=self.number_of_nn,
override_existing_keys=True)
# Retrieve info from DND dictionary
# We assume that all actions have enough entries in the DND
self.output = tf.transpose([
self._q_value(input_layer, action)
for action in range(self.num_actions)
])
def _q_value(self, input_layer, action):
result = tf.py_func(self.DND.query,
[input_layer, action, self.number_of_nn],
[tf.float64, tf.float64, tf.int64])
self.dnd_embeddings[action] = tf.to_float(result[0])
self.dnd_values[action] = tf.to_float(result[1])
self.dnd_indices[action] = result[2]
# DND calculation
square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1))
distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
self.dnd_distances[action] = distances
weights = 1.0 / distances
normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
q_value = tf.reduce_sum(self.dnd_values[action] * normalised_weights, axis=1)
q_value.set_shape((None,))
return q_value
def _post_build(self):
# DND gradients
self.dnd_embeddings_grad = tf.gradients(self.loss[0], self.dnd_embeddings)
self.dnd_values_grad = tf.gradients(self.loss[0], self.dnd_values)
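The _q_value computation above is an inverse-distance-weighted average over the values of the nearest keys returned from the DND. A standalone NumPy sketch of that weighting; the arrays below are invented for illustration:

import numpy as np

def dnd_q_value(query_embedding, neighbor_keys, neighbor_values, delta=1e-3):
    # squared L2 distance between the query embedding and each retrieved key
    distances = np.sum((neighbor_keys - query_embedding) ** 2, axis=-1) + delta
    weights = 1.0 / distances
    normalized_weights = weights / weights.sum()
    # kernel-weighted average of the stored values
    return float(np.sum(neighbor_values * normalized_weights))

keys = np.random.randn(5, 8)     # 5 nearest neighbors with 8-dimensional keys
values = np.random.randn(5)      # their stored Q-values
query = np.random.randn(8)
print(dnd_q_value(query, keys, values))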

View File

@@ -0,0 +1,50 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
from rl_coach.spaces import SpacesDefinition
class DuelingQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='dueling_q_head_params'):
super().__init__(parameterized_class=DuelingQHead, activation_function=activation_function, name=name)
class DuelingQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'dueling_q_values_head'
def _build_module(self, input_layer):
# state value tower - V
with tf.variable_scope("state_value"):
state_value = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
state_value = tf.layers.dense(state_value, 1, name='fc2')
# state_value = tf.expand_dims(state_value, axis=-1)
# action advantage tower - A
with tf.variable_scope("action_advantage"):
action_advantage = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
action_advantage = tf.layers.dense(action_advantage, self.num_actions, name='fc2')
action_advantage = action_advantage - tf.reduce_mean(action_advantage, axis=1, keepdims=True)
# merge to state-action value function Q
self.output = tf.add(state_value, action_advantage, name='output')
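As a quick illustration of the dueling combination, the advantages are centered per state (mean over the action dimension) before being added to the state value, which keeps V identifiable. A small NumPy example with arbitrary numbers:

import numpy as np

state_value = np.array([[1.5], [0.2]])                     # (batch, 1)
advantage = np.array([[2.0, 0.0, -2.0], [1.0, 1.0, 1.0]])  # (batch, num_actions)

centered = advantage - advantage.mean(axis=1, keepdims=True)
q_values = state_value + centered
# first state: [3.5, 1.5, -0.5]; second state: [0.2, 0.2, 0.2]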

View File

@@ -0,0 +1,165 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters, Parameters
from rl_coach.spaces import SpacesDefinition
from tensorflow.python.ops.losses.losses_impl import Reduction
from rl_coach.utils import force_list
# Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
def _initializer(shape, dtype=None, partition_info=None):
out = np.random.randn(*shape).astype(np.float32)
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
return tf.constant(out)
return _initializer
class HeadParameters(Parameters):
def __init__(self, parameterized_class: Type['Head'], activation_function: str = 'relu', name: str= 'head'):
super().__init__()
self.activation_function = activation_function
self.name = name
self.parameterized_class_name = parameterized_class.__name__
class Head(object):
"""
A head is the final part of the network. It takes the embedding from the middleware embedder and passes it through
a neural network to produce the output of the network. There can be multiple heads in a network, and each one has
an assigned loss function. The heads are algorithm dependent.
"""
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int=0, loss_weight: float=1., is_local: bool=True, activation_function: str='relu'):
self.head_idx = head_idx
self.network_name = network_name
self.network_parameters = agent_parameters.network_wrappers[self.network_name]
self.name = "head"
self.output = []
self.loss = []
self.loss_type = []
self.regularizations = []
self.loss_weight = force_list(loss_weight)
self.target = []
self.importance_weight = []
self.input = []
self.is_local = is_local
self.ap = agent_parameters
self.spaces = spaces
self.return_type = None
self.activation_function = activation_function
def __call__(self, input_layer):
"""
Wrapper for building the module graph including scoping and loss creation
:param input_layer: the input to the graph
:return: the head's output and input placeholders (for a local network, also the target and importance weight placeholders)
"""
with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
self._build_module(input_layer)
self.output = force_list(self.output)
self.target = force_list(self.target)
self.input = force_list(self.input)
self.loss_type = force_list(self.loss_type)
self.loss = force_list(self.loss)
self.regularizations = force_list(self.regularizations)
if self.is_local:
self.set_loss()
self._post_build()
if self.is_local:
return self.output, self.target, self.input, self.importance_weight
else:
return self.output, self.input
def _build_module(self, input_layer):
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the module's output tensors
in self.output.
:param input_layer: the input to the graph
:return: None
"""
pass
def _post_build(self):
"""
Optional function that allows adding any extra definitions after the head has been fully defined
For example, this allows doing additional calculations that are based on the loss
:return: None
"""
pass
def get_name(self):
"""
Get a formatted name for the module
:return: the formatted name
"""
return '{}_{}'.format(self.name, self.head_idx)
def set_loss(self):
"""
Creates a target placeholder and loss function for each loss_type and regularization
:return: None
"""
# there are heads that define the loss internally, but we need to create additional placeholders for them
for idx in range(len(self.loss)):
importance_weight = tf.placeholder('float',
[None] + [1] * (len(self.target[idx].shape) - 1),
'{}_importance_weight'.format(self.get_name()))
self.importance_weight.append(importance_weight)
# add losses and target placeholder
for idx in range(len(self.loss_type)):
# create target placeholder
target = tf.placeholder('float', self.output[idx].shape, '{}_target'.format(self.get_name()))
self.target.append(target)
# create importance sampling weights placeholder
num_target_dims = len(self.target[idx].shape)
importance_weight = tf.placeholder('float', [None] + [1] * (num_target_dims - 1),
'{}_importance_weight'.format(self.get_name()))
self.importance_weight.append(importance_weight)
# compute the weighted loss. importance_weight weights over the samples in the batch, while self.loss_weight
# weights the specific loss of this head against other losses in this head or in other heads
loss_weight = self.loss_weight[idx]*importance_weight
loss = self.loss_type[idx](self.target[-1], self.output[idx],
scope=self.get_name(), reduction=Reduction.NONE, loss_collection=None)
# the loss is first summed over each sample in the batch and then the mean over the batch is taken
loss = tf.reduce_mean(loss_weight*tf.reduce_sum(loss, axis=list(range(1, num_target_dims))))
# we add the loss to the losses collection and later we will extract it in general_network
tf.losses.add_loss(loss)
self.loss.append(loss)
# add regularizations
for regularization in self.regularizations:
self.loss.append(regularization)
@classmethod
def path(cls):
return cls.__name__
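To make the weighting in set_loss above concrete: the intent is that each sample's loss is summed over its non-batch dimensions, scaled by the head's loss weight times a per-sample importance-sampling weight, and then averaged over the batch. A NumPy sketch of that reduction, with illustrative shapes and values:

import numpy as np

per_element_loss = np.random.rand(4, 3)                      # e.g. element-wise squared errors
importance_weight = np.array([[1.0], [0.5], [2.0], [1.0]])   # (batch, 1), e.g. from prioritized replay
loss_weight = 0.5                                            # head-level loss weight

per_sample_loss = per_element_loss.sum(axis=1, keepdims=True)  # sum over the non-batch dims
total_head_loss = (loss_weight * importance_weight * per_sample_loss).mean()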

View File

@@ -0,0 +1,65 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import Measurements
class MeasurementsPredictionHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='measurements_prediction_head_params'):
super().__init__(parameterized_class=MeasurementsPredictionHead,
activation_function=activation_function, name=name)
class MeasurementsPredictionHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'future_measurements_head'
self.num_actions = len(self.spaces.action.actions)
self.num_measurements = self.spaces.state['measurements'].shape[0]
self.num_prediction_steps = agent_parameters.algorithm.num_predicted_steps_ahead
self.multi_step_measurements_size = self.num_measurements * self.num_prediction_steps
self.return_type = Measurements
def _build_module(self, input_layer):
# This is almost exactly the same as Dueling Network but we predict the future measurements for each action
# actions expectation tower (expectation stream) - E
with tf.variable_scope("expectation_stream"):
expectation_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
expectation_stream = tf.layers.dense(expectation_stream, self.multi_step_measurements_size, name='output')
expectation_stream = tf.expand_dims(expectation_stream, axis=1)
# action fine differences tower (action stream) - A
with tf.variable_scope("action_stream"):
action_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
action_stream = tf.layers.dense(action_stream, self.num_actions * self.multi_step_measurements_size,
name='output')
action_stream = tf.reshape(action_stream,
(tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size))
action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keepdims=True)
# merge to future measurements predictions
self.output = tf.add(expectation_stream, action_stream, name='output')
self.target = tf.placeholder(tf.float32, [None, self.num_actions, self.multi_step_measurements_size],
name="targets")
targets_nonan = tf.where(tf.is_nan(self.target), self.output, self.target)
self.loss = tf.reduce_sum(tf.reduce_mean(tf.square(targets_nonan - self.output), reduction_indices=0))
tf.losses.add_loss(self.loss_weight[0] * self.loss)
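The tf.where trick above masks unobserved targets: wherever the target tensor contains NaN, the prediction is substituted for it, so those entries contribute exactly zero error. A NumPy illustration with made-up numbers:

import numpy as np

prediction = np.array([[0.5, 1.0], [2.0, -1.0]])
target = np.array([[1.5, np.nan], [np.nan, -2.0]])

target_nonan = np.where(np.isnan(target), prediction, target)
squared_error = (target_nonan - prediction) ** 2   # NaN positions become exactly 0
loss = squared_error.mean(axis=0).sum()            # mean over the batch, sum over the rest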

View File

@@ -0,0 +1,88 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import BoxActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class NAFHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='naf_head_params'):
super().__init__(parameterized_class=NAFHead, activation_function=activation_function, name=name)
class NAFHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
if not isinstance(self.spaces.action, BoxActionSpace):
raise ValueError("NAF works only for continuous action spaces (BoxActionSpace)")
self.name = 'naf_q_values_head'
self.num_actions = self.spaces.action.shape[0]
self.output_scale = self.spaces.action.max_abs_range
self.return_type = QActionStateValue
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# NAF
self.action = tf.placeholder(tf.float32, [None, self.num_actions], name="action")
self.input = self.action
# V Head
self.V = tf.layers.dense(input_layer, 1, name='V')
# mu Head
mu_unscaled = tf.layers.dense(input_layer, self.num_actions, activation=self.activation_function, name='mu_unscaled')
self.mu = tf.multiply(mu_unscaled, self.output_scale, name='mu')
# A Head
# l_vector is a vector holding the entries of a lower-triangular matrix
self.l_vector = tf.layers.dense(input_layer, (self.num_actions * (self.num_actions + 1)) // 2, name='l_vector')
# Convert l to a lower triangular matrix and exponentiate its diagonal
i = 0
columns = []
for col in range(self.num_actions):
start_row = col
num_non_zero_elements = self.num_actions - start_row
zeros_column_part = tf.zeros_like(self.l_vector[:, 0:start_row])
diag_element = tf.expand_dims(tf.exp(self.l_vector[:, i]), 1)
non_zeros_non_diag_column_part = self.l_vector[:, (i + 1):(i + num_non_zero_elements)]
columns.append(tf.concat([zeros_column_part, diag_element, non_zeros_non_diag_column_part], axis=1))
i += num_non_zero_elements
self.L = tf.transpose(tf.stack(columns, axis=1), (0, 2, 1))
# P = L*L^T
self.P = tf.matmul(self.L, tf.transpose(self.L, (0, 2, 1)))
# A = -1/2 * (u - mu)^T * P * (u - mu)
action_diff = tf.expand_dims(self.action - self.mu, -1)
a_matrix_form = -0.5 * tf.matmul(tf.transpose(action_diff, (0, 2, 1)), tf.matmul(self.P, action_diff))
self.A = tf.reshape(a_matrix_form, [-1, 1])
# Q Head
self.Q = tf.add(self.V, self.A, name='Q')
self.output = self.Q
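The lower-triangular construction above packs num_actions * (num_actions + 1) // 2 values into L, exponentiates the diagonal, forms P = L L^T, and evaluates the quadratic advantage. The same algebra for a single sample in NumPy (sizes and values are illustrative):

import numpy as np

num_actions = 3
l_vector = np.random.randn(num_actions * (num_actions + 1) // 2)

# unpack the flat vector into a lower-triangular matrix with an exponentiated diagonal
L = np.zeros((num_actions, num_actions))
idx = 0
for col in range(num_actions):
    n = num_actions - col
    L[col, col] = np.exp(l_vector[idx])
    L[col + 1:, col] = l_vector[idx + 1: idx + n]
    idx += n

P = L @ L.T                                  # positive semi-definite by construction
mu = np.random.randn(num_actions)            # policy mean
u = np.random.randn(num_actions)             # taken action
advantage = -0.5 * (u - mu) @ P @ (u - mu)   # always <= 0, maximized when u == mu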

View File

@@ -0,0 +1,151 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, CompoundActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps
from rl_coach.core_types import ActionProbabilities
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
class PolicyHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params'):
super().__init__(parameterized_class=PolicyHead, activation_function=activation_function, name=name)
class PolicyHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'policy_values_head'
self.return_type = ActionProbabilities
self.beta = None
self.action_penalty = None
self.exploration_policy = agent_parameters.exploration
# a scalar weight that penalizes low entropy values to encourage exploration
if hasattr(agent_parameters.algorithm, 'beta_entropy'):
self.beta = agent_parameters.algorithm.beta_entropy
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
if hasattr(agent_parameters.algorithm, 'action_penalty'):
self.action_penalty = agent_parameters.algorithm.action_penalty
def _build_module(self, input_layer):
self.actions = []
self.input = self.actions
self.policy_distributions = []
self.output = []
action_spaces = [self.spaces.action]
if isinstance(self.spaces.action, CompoundActionSpace):
action_spaces = self.spaces.action.sub_action_spaces
# create a compound action network
for action_space_idx, action_space in enumerate(action_spaces):
with tf.variable_scope("sub_action_{}".format(action_space_idx)):
if isinstance(action_space, DiscreteActionSpace):
# create a discrete action network (softmax probabilities output)
self._build_discrete_net(input_layer, action_space)
elif isinstance(action_space, BoxActionSpace):
# create a continuous action network (bounded mean and stdev outputs)
self._build_continuous_net(input_layer, action_space)
if self.is_local:
# add entropy regularization
if self.beta:
self.entropy = tf.add_n([tf.reduce_mean(dist.entropy()) for dist in self.policy_distributions])
self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
# calculate loss
self.action_log_probs_wrt_policy = \
tf.add_n([dist.log_prob(action) for dist, action in zip(self.policy_distributions, self.actions)])
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
self.target = self.advantages
self.loss = -tf.reduce_mean(self.action_log_probs_wrt_policy * self.advantages)
tf.losses.add_loss(self.loss_weight[0] * self.loss)
def _build_discrete_net(self, input_layer, action_space):
num_actions = len(action_space.actions)
self.actions.append(tf.placeholder(tf.int32, [None], name="actions"))
policy_values = tf.layers.dense(input_layer, num_actions, name='fc')
self.policy_probs = tf.nn.softmax(policy_values, name="policy")
# define the distributions for the policy and the old policy
# (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
self.policy_distributions.append(policy_distribution)
self.output.append(self.policy_probs)
def _build_continuous_net(self, input_layer, action_space):
num_actions = action_space.shape
self.actions.append(tf.placeholder(tf.float32, [None, num_actions], name="actions"))
# output activation function
if np.all(action_space.max_abs_range < np.inf):
# bounded actions
self.output_scale = action_space.max_abs_range
self.continuous_output_activation = self.activation_function
else:
# unbounded actions
self.output_scale = 1
self.continuous_output_activation = None
# mean
pre_activation_policy_values_mean = tf.layers.dense(input_layer, num_actions, name='fc_mean')
policy_values_mean = pre_activation_policy_values_mean if self.continuous_output_activation is None else self.continuous_output_activation(pre_activation_policy_values_mean)
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
self.output.append(self.policy_mean)
# standard deviation
if isinstance(self.exploration_policy, ContinuousEntropyParameters):
# the stdev is an output of the network and uses a softplus activation as defined in A3C
policy_values_std = tf.layers.dense(input_layer, num_actions,
kernel_initializer=normalized_columns_initializer(0.01), name='fc_std')
self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps
self.output.append(self.policy_std)
else:
# the stdev is an externally given value
# Warning: we need to explicitly put this variable in the local variables collections, since defining
# it as not trainable puts it for some reason in the global variables collections. If this is not done,
# the variable won't be initialized and when working with multiple workers they will get stuck.
self.policy_std = tf.Variable(np.ones(num_actions), dtype='float32', trainable=False,
name='policy_stdev', collections=[tf.GraphKeys.LOCAL_VARIABLES])
# assign op for the policy std
self.policy_std_placeholder = tf.placeholder('float32', (num_actions,))
self.assign_policy_std = tf.assign(self.policy_std, self.policy_std_placeholder)
# define the distributions for the policy and the old policy
policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std)
self.policy_distributions.append(policy_distribution)
if self.is_local:
# add a penalty on the squared pre-activation values of the action
if self.action_penalty and self.action_penalty != 0:
self.regularizations += [
self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
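To ground the loss above: for a discrete action space the head minimizes the negative advantage-weighted log-likelihood of the actions that were actually taken. A NumPy sketch of that computation; the probabilities, actions and advantages below are invented:

import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.6, 0.3]])       # softmax output per sample
actions = np.array([0, 2])                # actions that were taken
advantages = np.array([1.5, -0.5])        # estimated advantages
eps = 1e-8

log_probs = np.log(probs[np.arange(len(actions)), actions] + eps)
loss = -np.mean(log_probs * advantages)   # minimizing this performs gradient ascent on expected return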

View File

@@ -0,0 +1,144 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters, normalized_columns_initializer
from rl_coach.core_types import ActionProbabilities
class PPOHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='ppo_head_params'):
super().__init__(parameterized_class=PPOHead, activation_function=activation_function, name=name)
class PPOHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ppo_head'
self.return_type = ActionProbabilities
# used in regular PPO
self.use_kl_regularization = agent_parameters.algorithm.use_kl_regularization
if self.use_kl_regularization:
# kl coefficient and its corresponding assignment operation and placeholder
self.kl_coefficient = tf.Variable(agent_parameters.algorithm.initial_kl_coefficient,
trainable=False, name='kl_coefficient')
self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
self.kl_cutoff = 2 * agent_parameters.algorithm.target_kl_divergence
self.high_kl_penalty_coefficient = agent_parameters.algorithm.high_kl_penalty_coefficient
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
self.beta = agent_parameters.algorithm.beta_entropy
def _build_module(self, input_layer):
if isinstance(self.spaces.action, DiscreteActionSpace):
self._build_discrete_net(input_layer, self.spaces.action)
elif isinstance(self.spaces.action, BoxActionSpace):
self._build_continuous_net(input_layer, self.spaces.action)
else:
raise ValueError("only discrete or continuous action spaces are supported for PPO")
self.action_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
self.action_probs_wrt_old_policy = self.old_policy_distribution.log_prob(self.actions)
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
# Used by regular PPO only
# add kl divergence regularization
self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(self.old_policy_distribution, self.policy_distribution))
if self.use_kl_regularization:
# no clipping => use kl regularization
self.weighted_kl_divergence = tf.multiply(self.kl_coefficient, self.kl_divergence)
self.regularizations = self.weighted_kl_divergence + self.high_kl_penalty_coefficient * \
tf.square(tf.maximum(0.0, self.kl_divergence - self.kl_cutoff))
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
# calculate surrogate loss
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
self.target = self.advantages
# the likelihood ratio exp(log_prob_new - log_prob_old) is always positive and well-defined
self.likelihood_ratio = tf.exp(self.action_probs_wrt_policy - self.action_probs_wrt_old_policy)
if self.clip_likelihood_ratio_using_epsilon is not None:
self.clip_param_rescaler = tf.placeholder(tf.float32, ())
self.input.append(self.clip_param_rescaler)
max_value = 1 + self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
min_value = 1 - self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
self.clipped_likelihood_ratio = tf.clip_by_value(self.likelihood_ratio, min_value, max_value)
self.scaled_advantages = tf.minimum(self.likelihood_ratio * self.advantages,
self.clipped_likelihood_ratio * self.advantages)
else:
self.scaled_advantages = self.likelihood_ratio * self.advantages
# the minus sign turns the surrogate objective, which we want to maximize, into a loss to minimize
self.surrogate_loss = -tf.reduce_mean(self.scaled_advantages)
if self.is_local:
# add entropy regularization
if self.beta:
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
self.regularizations = -tf.multiply(self.beta, self.entropy, name='entropy_regularization')
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
self.loss = self.surrogate_loss
tf.losses.add_loss(self.loss)
def _build_discrete_net(self, input_layer, action_space):
num_actions = len(action_space.actions)
self.actions = tf.placeholder(tf.int32, [None], name="actions")
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
# Policy Head
self.input = [self.actions, self.old_policy_mean]
policy_values = tf.layers.dense(input_layer, num_actions, name='policy_fc')
self.policy_mean = tf.nn.softmax(policy_values, name="policy")
# define the distributions for the policy and the old policy
self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
self.output = self.policy_mean
def _build_continuous_net(self, input_layer, action_space):
num_actions = action_space.shape[0]
self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
self.input = [self.actions, self.old_policy_mean, self.old_policy_std]
self.policy_mean = tf.layers.dense(input_layer, num_actions, name='policy_mean',
kernel_initializer=normalized_columns_initializer(0.01))
if self.is_local:
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
collections=[tf.GraphKeys.LOCAL_VARIABLES])
else:
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32')
self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')
# define the distributions for the policy and the old policy
self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std + eps)
self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, self.old_policy_std + eps)
self.output = [self.policy_mean, self.policy_std]
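The clipped surrogate objective above boils down to a few lines: the likelihood ratio is clipped to [1 - eps, 1 + eps] and the pessimistic (element-wise minimum) of the clipped and unclipped advantage-weighted terms is maximized. A NumPy restatement with illustrative numbers:

import numpy as np

log_prob_new = np.array([-0.2, -1.0, -0.1])
log_prob_old = np.array([-0.5, -0.8, -0.4])
advantages = np.array([1.0, -2.0, 0.5])
clip_eps = 0.2

ratio = np.exp(log_prob_new - log_prob_old)
clipped_ratio = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
surrogate = np.minimum(ratio * advantages, clipped_ratio * advantages)
loss = -np.mean(surrogate)   # minus sign: maximize the surrogate by minimizing the loss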

View File

@@ -0,0 +1,52 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.core_types import ActionProbabilities
class PPOVHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='ppo_v_head_params'):
super().__init__(parameterized_class=PPOVHead, activation_function=activation_function, name=name)
class PPOVHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ppo_v_head'
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
self.return_type = ActionProbabilities
def _build_module(self, input_layer):
self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
self.input = [self.old_policy_value]
self.output = tf.layers.dense(input_layer, 1, name='output',
kernel_initializer=normalized_columns_initializer(1.0))
self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")
value_loss_1 = tf.square(self.output - self.target)
value_loss_2 = tf.square(self.old_policy_value +
tf.clip_by_value(self.output - self.old_policy_value,
-self.clip_likelihood_ratio_using_epsilon,
self.clip_likelihood_ratio_using_epsilon) - self.target)
self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))
self.loss = self.vf_loss
tf.losses.add_loss(self.loss)
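The value loss above applies the same clipping idea to the critic: the new value prediction may not move further than the clip range away from the old prediction, and the worse (element-wise maximum) of the clipped and unclipped squared errors is used. A NumPy illustration with made-up numbers:

import numpy as np

old_values = np.array([1.0, 0.0, -0.5])
new_values = np.array([1.6, 0.1, -1.5])
returns = np.array([2.0, -0.2, -1.0])
clip_eps = 0.2

clipped_values = old_values + np.clip(new_values - old_values, -clip_eps, clip_eps)
value_loss = np.mean(np.maximum((new_values - returns) ** 2,
                                (clipped_values - returns) ** 2))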

View File

@@ -0,0 +1,50 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpace
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class QHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='q_head_params'):
super().__init__(parameterized_class=QHead, activation_function=activation_function, name=name)
class QHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'q_values_head'
if isinstance(self.spaces.action, BoxActionSpace):
self.num_actions = 1
elif isinstance(self.spaces.action, DiscreteActionSpace):
self.num_actions = len(self.spaces.action.actions)
self.return_type = QActionStateValue
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# Standard Q Network
self.output = tf.layers.dense(input_layer, self.num_actions, name='output')

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class QuantileRegressionQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='quantile_regression_q_head_params'):
super().__init__(parameterized_class=QuantileRegressionQHead, activation_function=activation_function,
name=name)
class QuantileRegressionQHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'quantile_regression_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms # we use atom / quantile interchangeably
self.huber_loss_interval = agent_parameters.algorithm.huber_loss_interval # k
self.return_type = QActionStateValue
def _build_module(self, input_layer):
self.actions = tf.placeholder(tf.int32, [None, 2], name="actions")
self.quantile_midpoints = tf.placeholder(tf.float32, [None, self.num_atoms], name="quantile_midpoints")
self.input = [self.actions, self.quantile_midpoints]
# the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
quantiles_locations = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
quantiles_locations = tf.reshape(quantiles_locations, (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
self.output = quantiles_locations
self.quantiles = tf.placeholder(tf.float32, shape=(None, self.num_atoms), name="quantiles")
self.target = self.quantiles
# only the quantiles of the taken action are taken into account
quantiles_for_used_actions = tf.gather_nd(quantiles_locations, self.actions)
# reorder the output quantiles and the target quantiles as a preparation step for calculating the loss
# the output quantiles vector and the quantile midpoints are tiled as rows of an NxN matrix (N = num quantiles)
# the target quantiles vector is tiled as columns of an NxN matrix
theta_i = tf.tile(tf.expand_dims(quantiles_for_used_actions, -1), [1, 1, self.num_atoms])
T_theta_j = tf.tile(tf.expand_dims(self.target, -2), [1, self.num_atoms, 1])
tau_i = tf.tile(tf.expand_dims(self.quantile_midpoints, -1), [1, 1, self.num_atoms])
# Huber loss of T(theta_j) - theta_i
error = T_theta_j - theta_i
abs_error = tf.abs(error)
quadratic = tf.minimum(abs_error, self.huber_loss_interval)
huber_loss = self.huber_loss_interval * (abs_error - quadratic) + 0.5 * quadratic ** 2
# Quantile Huber loss
quantile_huber_loss = tf.abs(tau_i - tf.cast(error < 0, dtype=tf.float32)) * huber_loss
# Quantile regression loss (the probability for each quantile is 1/num_quantiles)
quantile_regression_loss = tf.reduce_sum(quantile_huber_loss) / float(self.num_atoms)
self.loss = quantile_regression_loss
tf.losses.add_loss(self.loss)
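A compact NumPy restatement of the quantile Huber loss computed above, for a single state; the quantile count and values are illustrative:

import numpy as np

num_atoms = 4
k = 1.0                                         # Huber loss interval
theta = np.random.randn(num_atoms)              # predicted quantile locations theta_i
target = np.random.randn(num_atoms)             # target quantile locations T(theta_j)
tau = (np.arange(num_atoms) + 0.5) / num_atoms  # quantile midpoints

# pairwise errors: rows index theta_i, columns index the targets T(theta_j)
error = target[None, :] - theta[:, None]
abs_error = np.abs(error)
quadratic = np.minimum(abs_error, k)
huber = k * (abs_error - quadratic) + 0.5 * quadratic ** 2

# asymmetric quantile weighting of the Huber loss
quantile_huber = np.abs(tau[:, None] - (error < 0)) * huber
loss = quantile_huber.sum() / num_atoms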

View File

@@ -0,0 +1,45 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.core_types import VStateValue
class VHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='v_head_params'):
super().__init__(parameterized_class=VHead, activation_function=activation_function, name=name)
class VHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'v_values_head'
self.return_type = VStateValue
if agent_parameters.network_wrappers[self.network_name.split('/')[0]].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# Standard V Network
self.output = tf.layers.dense(input_layer, 1, name='output',
kernel_initializer=normalized_columns_initializer(1.0))

View File

@@ -0,0 +1,86 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout, Dense
from rl_coach.core_types import Middleware_FC_Embedding
class FCMiddlewareParameters(MiddlewareParameters):
def __init__(self, activation_function='relu',
scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_fc_embedder"):
super().__init__(parameterized_class=FCMiddleware, activation_function=activation_function,
scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
class FCMiddleware(Middleware):
schemes = {
MiddlewareScheme.Empty:
[],
# ppo
MiddlewareScheme.Shallow:
[
Dense([64])
],
# dqn
MiddlewareScheme.Medium:
[
Dense([512])
],
MiddlewareScheme.Deep: \
[
Dense([128]),
Dense([128]),
Dense([128])
]
}
def __init__(self, activation_function=tf.nn.relu,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_fc_embedder"):
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
dropout=dropout, scheme=scheme, name=name)
self.return_type = Middleware_FC_Embedding
self.layers = []
def _build_module(self):
self.layers.append(self.input)
if isinstance(self.scheme, MiddlewareScheme):
layers_params = FCMiddleware.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
self.output = self.layers[-1]

View File

@@ -0,0 +1,113 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.core_types import Middleware_LSTM_Embedding
class LSTMMiddlewareParameters(MiddlewareParameters):
def __init__(self, activation_function='relu', number_of_lstm_cells=256,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_lstm_embedder"):
super().__init__(parameterized_class=LSTMMiddleware, activation_function=activation_function,
scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
self.number_of_lstm_cells = number_of_lstm_cells
class LSTMMiddleware(Middleware):
schemes = {
MiddlewareScheme.Empty:
[],
# ppo
MiddlewareScheme.Shallow:
[
[64]
],
# dqn
MiddlewareScheme.Medium:
[
[512]
],
MiddlewareScheme.Deep: \
[
[128],
[128],
[128]
]
}
def __init__(self, activation_function=tf.nn.relu, number_of_lstm_cells: int=256,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_lstm_embedder"):
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
dropout=dropout, scheme=scheme, name=name)
self.return_type = Middleware_LSTM_Embedding
self.number_of_lstm_cells = number_of_lstm_cells
self.layers = []
def _build_module(self):
"""
self.state_in: tuple of placeholders containing the initial state
self.state_out: tuple of output state
TODO: the output state appears to have shape (batch, features), and the code below keeps only the
first element of the batch, which looks wrong; the shapes need to be double checked
"""
self.layers.append(self.input)
# optionally insert some dense layers before the LSTM
if isinstance(self.scheme, MiddlewareScheme):
layers_params = LSTMMiddleware.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
tf.layers.dense(self.layers[-1], layer_params[0], name='fc{}'.format(idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
# add the LSTM layer
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.number_of_lstm_cells, state_is_tuple=True)
self.c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
self.h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
self.state_init = [self.c_init, self.h_init]
self.c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
self.h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
self.state_in = (self.c_in, self.h_in)
rnn_in = tf.expand_dims(self.layers[-1], [0])
step_size = tf.shape(self.layers[-1])[:1]
state_in = tf.nn.rnn_cell.LSTMStateTuple(self.c_in, self.h_in)
lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False)
lstm_c, lstm_h = lstm_state
self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
self.output = tf.reshape(lstm_outputs, [-1, self.number_of_lstm_cells])
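# A hypothetical usage sketch (not part of the original file), assuming a built LSTM middleware and an
# active TF session: the recurrent state is carried across calls by feeding state_out back into state_in.
#   state = middleware.state_init
#   output, state = sess.run([middleware.output, middleware.state_out],
#                            feed_dict={middleware.c_in: state[0], middleware.h_in: state[1],
#                                       **other_network_feeds})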

View File

@@ -0,0 +1,68 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type, Union, List
import tensorflow as tf
from rl_coach.base_parameters import MiddlewareScheme, Parameters
from rl_coach.core_types import MiddlewareEmbedding
class MiddlewareParameters(Parameters):
def __init__(self, parameterized_class: Type['Middleware'],
activation_function: str='relu', scheme: Union[List, MiddlewareScheme]=MiddlewareScheme.Medium,
batchnorm: bool=False, dropout: bool=False,
name='middleware'):
super().__init__()
self.activation_function = activation_function
self.scheme = scheme
self.batchnorm = batchnorm
self.dropout = dropout
self.name = name
self.parameterized_class_name = parameterized_class.__name__
class Middleware(object):
"""
A middleware embedder is the middle part of the network. It takes the embeddings from the input embedders,
after they have been aggregated in some way (for example, concatenation), and passes them through a neural
network which can be customized but is shared between the heads of the network
"""
def __init__(self, activation_function=tf.nn.relu,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False, name="middleware_embedder"):
self.name = name
self.input = None
self.output = None
self.activation_function = activation_function
self.batchnorm = batchnorm
self.dropout = dropout
self.dropout_rate = 0
self.scheme = scheme
self.return_type = MiddlewareEmbedding
def __call__(self, input_layer):
with tf.variable_scope(self.get_name()):
self.input = input_layer
self._build_module()
return self.input, self.output
def _build_module(self):
pass
def get_name(self):
return self.name
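# A hypothetical usage sketch (not part of the original file): a concrete middleware (for example
# FCMiddleware or LSTMMiddleware from the sibling modules) is applied to the merged embedder output
# inside its own variable scope and returns both its input and output tensors.
#   middleware = FCMiddleware(scheme=MiddlewareScheme.Medium)
#   middleware_input, middleware_output = middleware(merged_embeddings_tensor)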

View File

@@ -0,0 +1,121 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
class SharedRunningStats(object):
def __init__(self, replicated_device=None, epsilon=1e-2, name="", create_ops=True):
self.sess = None
self.name = name
self.replicated_device = replicated_device
self.epsilon = epsilon
self.ops_were_created = False
if create_ops:
with tf.device(replicated_device):
self.create_ops()
def create_ops(self, shape=[1], clip_values=None):
self.clip_values = clip_values
with tf.variable_scope(self.name):
self._sum = tf.get_variable(
dtype=tf.float64,
initializer=tf.constant_initializer(0.0),
name="running_sum", trainable=False, shape=shape, validate_shape=False,
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._sum_squared = tf.get_variable(
dtype=tf.float64,
initializer=tf.constant_initializer(self.epsilon),
name="running_sum_squared", trainable=False, shape=shape, validate_shape=False,
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._count = tf.get_variable(
dtype=tf.float64,
shape=(),
initializer=tf.constant_initializer(self.epsilon),
name="count", trainable=False, collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._shape = None
self._mean = tf.div(self._sum, self._count, name="mean")
self._std = tf.sqrt(tf.maximum((self._sum_squared - self._count*tf.square(self._mean))
/ tf.maximum(self._count-1, 1), self.epsilon), name="stdev")
self.tf_mean = tf.cast(self._mean, 'float32')
self.tf_std = tf.cast(self._std, 'float32')
self.new_sum = tf.placeholder(dtype=tf.float64, name='sum')
self.new_sum_squared = tf.placeholder(dtype=tf.float64, name='var')
self.newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
self._inc_sum = tf.assign_add(self._sum, self.new_sum, use_locking=True)
self._inc_sum_squared = tf.assign_add(self._sum_squared, self.new_sum_squared, use_locking=True)
self._inc_count = tf.assign_add(self._count, self.newcount, use_locking=True)
self.raw_obs = tf.placeholder(dtype=tf.float64, name='raw_obs')
self.normalized_obs = (self.raw_obs - self._mean) / self._std
if self.clip_values is not None:
self.clipped_obs = tf.clip_by_value(self.normalized_obs, self.clip_values[0], self.clip_values[1])
self.ops_were_created = True
def set_session(self, sess):
self.sess = sess
def push(self, x):
x = x.astype('float64')
self.sess.run([self._inc_sum, self._inc_sum_squared, self._inc_count],
feed_dict={
self.new_sum: x.sum(axis=0).ravel(),
self.new_sum_squared: np.square(x).sum(axis=0).ravel(),
self.newcount: np.array(len(x), dtype='float64')
})
if self._shape is None:
self._shape = x.shape
@property
def n(self):
return self.sess.run(self._count)
@property
def mean(self):
return self.sess.run(self._mean)
@property
def var(self):
return self.std ** 2
@property
def std(self):
return self.sess.run(self._std)
@property
def shape(self):
return self._shape
@shape.setter
def shape(self, val):
self._shape = val
self.new_sum.set_shape(val)
self.new_sum_squared.set_shape(val)
self.tf_mean.set_shape(val)
self.tf_std.set_shape(val)
self._sum.set_shape(val)
self._sum_squared.set_shape(val)
def normalize(self, batch):
if self.clip_values is not None:
return self.sess.run(self.clipped_obs, feed_dict={self.raw_obs: batch})
else:
return self.sess.run(self.normalized_obs, feed_dict={self.raw_obs: batch})
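# A minimal NumPy-only sketch (not part of the original file) of the same running-statistics update
# that SharedRunningStats implements with shared TF variables: accumulate a sum, a sum of squares
# and a count, and derive the mean and an epsilon-floored std from them.
class _RunningStatsSketch(object):
    def __init__(self, epsilon=1e-2):
        self._sum = 0.0
        self._sum_squared = epsilon
        self._count = epsilon

    def push(self, x):
        # accumulate per-feature sums over the batch dimension
        x = np.asarray(x, dtype=np.float64)
        self._sum = self._sum + x.sum(axis=0)
        self._sum_squared = self._sum_squared + np.square(x).sum(axis=0)
        self._count += len(x)

    @property
    def mean(self):
        return self._sum / self._count

    @property
    def std(self):
        var = (self._sum_squared - self._count * np.square(self.mean)) / max(self._count - 1, 1)
        return np.sqrt(np.maximum(var, 1e-2))

    def normalize(self, batch):
        return (np.asarray(batch) - self.mean) / self.std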

350
rl_coach/base_parameters.py Normal file
View File

@@ -0,0 +1,350 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import inspect
import json
import os
import sys
import types
from collections import OrderedDict
from enum import Enum
from typing import Dict, List, Union
from rl_coach.core_types import TrainingSteps, EnvironmentSteps, GradientClippingMethod
from rl_coach.filters.filter import NoInputFilter
class Frameworks(Enum):
tensorflow = "TensorFlow"
class EmbedderScheme(Enum):
Empty = "Empty"
Shallow = "Shallow"
Medium = "Medium"
Deep = "Deep"
class MiddlewareScheme(Enum):
Empty = "Empty"
Shallow = "Shallow"
Medium = "Medium"
Deep = "Deep"
class EmbeddingMergerType(Enum):
Concat = 0
Sum = 1
#ConcatDepthWise = 2
#Multiply = 3
def iterable_to_items(obj):
if isinstance(obj, dict) or isinstance(obj, OrderedDict) or isinstance(obj, types.MappingProxyType):
items = obj.items()
elif isinstance(obj, list):
items = enumerate(obj)
else:
raise ValueError("The given object is not a dict or a list")
return items
def unfold_dict_or_list(obj: Union[Dict, List, OrderedDict]):
"""
Recursively unfolds all the parameters in dictionaries and lists
:param obj: a dictionary or list to unfold
:return: the unfolded parameters dictionary
"""
parameters = OrderedDict()
items = iterable_to_items(obj)
for k, v in items:
if isinstance(v, dict) or isinstance(v, list) or isinstance(v, OrderedDict):
if 'tensorflow.' not in str(v.__class__):
parameters[k] = unfold_dict_or_list(v)
elif 'tensorflow.' in str(v.__class__):
parameters[k] = v
elif hasattr(v, '__dict__'):
sub_params = v.__dict__
if '__objclass__' not in sub_params.keys():
try:
parameters[k] = unfold_dict_or_list(sub_params)
except RecursionError:
parameters[k] = sub_params
parameters[k]['__class__'] = v.__class__.__name__
else:
# unfolding this type of object will result in infinite recursion
parameters[k] = sub_params
else:
parameters[k] = v
if not isinstance(obj, OrderedDict) and not isinstance(obj, list):
parameters = OrderedDict(sorted(parameters.items()))
return parameters
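# For example (illustrative): unfold_dict_or_list({'b': {'c': 3}, 'a': [1, 2]}) returns
# OrderedDict([('a', OrderedDict([(0, 1), (1, 2)])), ('b', OrderedDict([('c', 3)]))]).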
class Parameters(object):
def __setattr__(self, key, value):
caller_name = sys._getframe(1).f_code.co_name
if caller_name != '__init__' and not hasattr(self, key):
raise TypeError("Parameter '{}' does not exist in {}. Parameters are only to be defined in a constructor of"
" a class inheriting from Parameters. In order to explicitly register a new parameter "
"outside of a constructor use register_var().".
format(key, self.__class__))
object.__setattr__(self, key, value)
@property
def path(self):
if hasattr(self, 'parameterized_class_name'):
module_path = os.path.relpath(inspect.getfile(self.__class__), os.getcwd())[:-3] + '.py'
return ':'.join([module_path, self.parameterized_class_name])
else:
raise ValueError("The parameters class does not have an attached class it parameterizes. "
"The self.parameterized_class_name should be set to the parameterized class.")
def register_var(self, key, value):
if hasattr(self, key):
raise TypeError("Cannot register an already existing parameter '{}'. ".format(key))
object.__setattr__(self, key, value)
def __str__(self):
result = "\"{}\" {}\n".format(self.__class__.__name__,
json.dumps(unfold_dict_or_list(self.__dict__), indent=4, default=repr))
return result
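# A hypothetical usage sketch (not part of the original file): attributes of a Parameters subclass
# may only be created inside __init__; any later addition must go through register_var().
#   class MySketchParameters(Parameters):
#       def __init__(self):
#           super().__init__()
#           self.learning_rate = 0.001
#   params = MySketchParameters()
#   params.learning_rate = 0.01              # fine, defined in the constructor
#   params.register_var('extra_flag', True)  # explicit registration of a new parameter
#   params.new_field = 1                     # would raise a TypeError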
class AlgorithmParameters(Parameters):
def __init__(self):
# Architecture parameters
self.use_accumulated_reward_as_measurement = False
# Agent parameters
self.num_consecutive_playing_steps = EnvironmentSteps(1)
self.num_consecutive_training_steps = 1 # TODO: update this to TrainingSteps
self.heatup_using_network_decisions = False
self.discount = 0.99
self.apply_gradients_every_x_episodes = 5
self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
self.rate_for_copying_weights_to_target = 1.0
self.load_memory_from_file_path = None
self.collect_new_data = True
# HRL / HER related params
self.in_action_space = None
# distributed agents params
self.share_statistics_between_workers = True
# intrinsic reward
self.scale_external_reward_by_intrinsic_reward_value = False
class PresetValidationParameters(Parameters):
def __init__(self):
super().__init__()
# setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
# the form of different workers starting at different times, and getting different assignments of CPU
# time from the OS.
# Testing parameters
self.test = False
self.min_reward_threshold = 0
self.max_episodes_to_achieve_reward = 1
self.num_workers = 1
self.reward_test_level = None
self.trace_test_levels = None
self.trace_max_env_steps = 5000
class NetworkParameters(Parameters):
def __init__(self):
super().__init__()
self.framework = Frameworks.tensorflow
self.sess = None
# hardware parameters
self.force_cpu = False
# distributed training options
self.num_threads = 1
self.synchronize_over_num_threads = 1
self.distributed = False
self.async_training = False
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
# regularization
self.clip_gradients = None
self.gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm
self.kl_divergence_constraint = None
self.l2_regularization = 0
# learning rate
self.learning_rate = 0.00025
self.learning_rate_decay_rate = 0
self.learning_rate_decay_steps = 0
# structure
self.input_embedders_parameters = []
self.embedding_merger_type = EmbeddingMergerType.Concat
self.middleware_parameters = None
self.heads_parameters = []
self.num_output_head_copies = 1
self.loss_weights = []
self.rescale_gradient_from_head_by_factor = [1]
self.use_separate_networks_per_head = False
self.optimizer_type = 'Adam'
self.optimizer_epsilon = 0.0001
self.adam_optimizer_beta1 = 0.9
self.adam_optimizer_beta2 = 0.99
self.rms_prop_optimizer_decay = 0.9
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
# Framework support
self.tensorflow_support = True
class InputEmbedderParameters(Parameters):
def __init__(self, activation_function: str='relu', scheme: Union[List, EmbedderScheme]=EmbedderScheme.Medium,
batchnorm: bool=False, dropout=False, name: str='embedder', input_rescaling=None, input_offset=None,
input_clipping=None):
super().__init__()
self.activation_function = activation_function
self.scheme = scheme
self.batchnorm = batchnorm
self.dropout = dropout
if input_rescaling is None:
input_rescaling = {'image': 255.0, 'vector': 1.0}
if input_offset is None:
input_offset = {'image': 0.0, 'vector': 0.0}
self.input_rescaling = input_rescaling
self.input_offset = input_offset
self.input_clipping = input_clipping
self.name = name
@property
def path(self):
return {
"image": 'image_embedder:ImageEmbedder',
"vector": 'vector_embedder:VectorEmbedder'
}
class VisualizationParameters(Parameters):
def __init__(self):
super().__init__()
# Visualization parameters
self.print_summary = True
self.dump_csv = True
self.dump_gifs = False
self.dump_mp4 = False
self.dump_signals_to_csv_every_x_episodes = 5
self.dump_in_episode_signals = False
self.dump_parameters_documentation = True
self.render = False
self.native_rendering = False
self.max_fps_for_human_control = 10
self.tensorboard = False
self.video_dump_methods = [] # a list of dump methods which will be checked one after the other until the first
# dump method that returns false for should_dump()
self.add_rendered_image_to_env_response = False
class AgentParameters(Parameters):
def __init__(self, algorithm: AlgorithmParameters, exploration: 'ExplorationParameters', memory: 'MemoryParameters',
networks: Dict[str, NetworkParameters], visualization: VisualizationParameters=VisualizationParameters()):
"""
:param algorithm: the algorithmic parameters
:param exploration: the exploration policy parameters
:param memory: the memory module parameters
:param networks: the parameters for the networks of the agent
:param visualization: the visualization parameters
"""
super().__init__()
self.visualization = visualization
self.algorithm = algorithm
self.exploration = exploration
self.memory = memory
self.network_wrappers = networks
self.input_filter = None
self.output_filter = None
self.pre_network_filter = NoInputFilter()
self.full_name_id = None # TODO: do we really want to hold this parameters here?
self.name = None
self.is_a_highest_level_agent = True
self.is_a_lowest_level_agent = True
self.task_parameters = None
@property
def path(self):
return 'rl_coach.agents.agent:Agent'
class TaskParameters(Parameters):
def __init__(self, framework_type: str, evaluate_only: bool=False, use_cpu: bool=False, experiment_path=None,
seed=None):
"""
:param framework_type: deep learning framework type. currently only tensorflow is supported
:param evaluate_only: the task will be used only for evaluating the model
:param use_cpu: use the cpu for this task
:param experiment_path: the path to the directory which will store all the experiment outputs
:param seed: a seed to use for the random numbers generator
"""
self.framework_type = framework_type
self.task_index = None # TODO: not really needed
self.evaluate_only = evaluate_only
self.use_cpu = use_cpu
self.experiment_path = experiment_path
self.seed = seed
class DistributedTaskParameters(TaskParameters):
def __init__(self, framework_type: str, parameters_server_hosts: str, worker_hosts: str, job_type: str,
task_index: int, evaluate_only: bool=False, num_tasks: int=None,
num_training_tasks: int=None, use_cpu: bool=False, experiment_path=None, dnd=None,
shared_memory_scratchpad=None, seed=None):
"""
:param framework_type: deep learning framework type. currently only tensorflow is supported
:param evaluate_only: the task will be used only for evaluating the model
:param parameters_server_hosts: comma-separated list of hostname:port pairs to which the parameter servers are
assigned
:param worker_hosts: comma-separated list of hostname:port pairs to which the workers are assigned
:param job_type: the job type - either ps (short for parameters server) or worker
:param task_index: the index of the process
:param num_tasks: the number of total tasks that are running (not including the parameters server)
:param num_training_tasks: the number of tasks that are training (not including the parameters server)
:param use_cpu: use the cpu for this task
:param experiment_path: the path to the directory which will store all the experiment outputs
:param dnd: an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad.
:param seed: a seed to use for the random numbers generator
"""
super().__init__(framework_type=framework_type, evaluate_only=evaluate_only, use_cpu=use_cpu,
experiment_path=experiment_path, seed=seed)
self.parameters_server_hosts = parameters_server_hosts
self.worker_hosts = worker_hosts
self.job_type = job_type
self.task_index = task_index
self.num_tasks = num_tasks
self.num_training_tasks = num_training_tasks
self.device = None # the replicated device which will be used for the global parameters
self.worker_target = None
self.dnd = dnd
self.shared_memory_scratchpad = shared_memory_scratchpad

402
rl_coach/coach.py Normal file
View File

@@ -0,0 +1,402 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys
sys.path.append('.')
import copy
from rl_coach.core_types import EnvironmentSteps
import os
from rl_coach import logger
import traceback
from rl_coach.logger import screen, failed_imports
import argparse
import atexit
import time
import sys
from rl_coach.base_parameters import Frameworks, VisualizationParameters, TaskParameters, DistributedTaskParameters
from multiprocessing import Process
from multiprocessing.managers import BaseManager
import subprocess
from rl_coach.graph_managers.graph_manager import HumanPlayScheduleParameters, GraphManager
from rl_coach.utils import list_all_presets, short_dynamic_import, get_open_port, SharedMemoryScratchPad, get_base_dir
from rl_coach.agents.human_agent import HumanAgentParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.environments.environment import SingleLevelSelection
if len(set(failed_imports)) > 0:
screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports))))
def get_graph_manager_from_args(args: argparse.Namespace) -> 'GraphManager':
"""
Return the graph manager according to the command line arguments given by the user
:param args: the arguments given by the user
:return: the updated graph manager
"""
graph_manager = None
# if a preset was given we will load the graph manager for the preset
if args.preset is not None:
graph_manager = short_dynamic_import(args.preset, ignore_module_case=True)
# for human play we need to create a custom graph manager
if args.play:
env_params = short_dynamic_import(args.environment_type, ignore_module_case=True)()
env_params.human_control = True
schedule_params = HumanPlayScheduleParameters()
graph_manager = BasicRLGraphManager(HumanAgentParameters(), env_params, schedule_params, VisualizationParameters())
if args.level:
if isinstance(graph_manager.env_params.level, SingleLevelSelection):
graph_manager.env_params.level.select(args.level)
else:
graph_manager.env_params.level = args.level
# set the seed for the environment
if args.seed is not None:
graph_manager.env_params.seed = args.seed
# visualization
graph_manager.visualization_parameters.dump_gifs = graph_manager.visualization_parameters.dump_gifs or args.dump_gifs
graph_manager.visualization_parameters.dump_mp4 = graph_manager.visualization_parameters.dump_mp4 or args.dump_mp4
graph_manager.visualization_parameters.render = args.render
graph_manager.visualization_parameters.tensorboard = args.tensorboard
# update the custom parameters
if args.custom_parameter is not None:
unstripped_key_value_pairs = [pair.split('=') for pair in args.custom_parameter.split(';')]
stripped_key_value_pairs = [tuple([pair[0].strip(), pair[1].strip()]) for pair in
unstripped_key_value_pairs if len(pair) == 2]
# load custom parameters into run_dict
for key, value in stripped_key_value_pairs:
exec("graph_manager.{}={}".format(key, value))
return graph_manager
def parse_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
"""
Parse the arguments that the user entered
:param parser: the argparse command line parser
:return: the parsed arguments
"""
args = parser.parse_args()
# if no arg is given
if len(sys.argv) == 1:
parser.print_help()
exit(0)
# list available presets
preset_names = list_all_presets()
if args.list:
screen.log_title("Available Presets:")
for preset in sorted(preset_names):
print(preset)
sys.exit(0)
# replace a short preset name with the full path
if args.preset is not None:
if args.preset.lower() in [p.lower() for p in preset_names]:
args.preset = "{}.py:graph_manager".format(os.path.join(get_base_dir(), 'presets', args.preset))
else:
args.preset = "{}".format(args.preset)
# verify that the preset exists
preset_path = args.preset.split(":")[0]
if not os.path.exists(preset_path):
screen.error("The given preset ({}) cannot be found.".format(args.preset))
# verify that the preset can be instantiated
try:
short_dynamic_import(args.preset, ignore_module_case=True)
except TypeError as e:
traceback.print_exc()
screen.error('Internal Error: ' + str(e) + "\n\nThe given preset ({}) cannot be instantiated."
.format(args.preset))
# validate the checkpoints args
if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir):
screen.error("The requested checkpoint folder to load from does not exist.")
# no preset was given. check if the user requested to play some environment on its own
if args.preset is None and args.play:
if args.environment_type:
args.agent_type = 'Human'
else:
screen.error('When no preset is given for Coach to run, and the user requests human control over '
'the environment, the user is expected to input the desired environment_type and level.'
'\nAt least one of these parameters was not given.')
elif args.preset and args.play:
screen.error("Both the --preset and the --play flags were set. These flags can not be used together. "
"For human control, please use the --play flag together with the environment type flag (-et)")
elif args.preset is None and not args.play:
screen.error("Please choose a preset using the -p flag or use the --play flag together with choosing an "
"environment type (-et) in order to play the game.")
# get experiment name and path
args.experiment_name = logger.get_experiment_name(args.experiment_name)
args.experiment_path = logger.get_experiment_path(args.experiment_name)
if args.play and args.num_workers > 1:
screen.warning("Playing the game as a human is only available with a single worker. "
"The number of workers will be reduced to 1")
args.num_workers = 1
args.framework = Frameworks[args.framework.lower()]
# checkpoints
args.save_checkpoint_dir = os.path.join(args.experiment_path, 'checkpoint') if args.save_checkpoint_secs is not None else None
return args
def add_items_to_dict(target_dict, source_dict):
updated_task_parameters = copy.copy(source_dict)
updated_task_parameters.update(target_dict)
return updated_task_parameters
def open_dashboard(experiment_path):
dashboard_path = 'python {}/dashboard.py'.format(get_base_dir())
cmd = "{} --experiment_dir {}".format(dashboard_path, experiment_path)
screen.log_title("Opening dashboard - experiment path: {}".format(experiment_path))
# subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True, executable="/bin/bash")
subprocess.Popen(cmd, shell=True, executable="/bin/bash")
def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'):
graph_manager.create_graph(task_parameters)
# let the adventure begin
if task_parameters.evaluate_only:
graph_manager.evaluate(EnvironmentSteps(sys.maxsize), keep_networks_in_sync=True)
else:
graph_manager.improve()
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--preset',
help="(string) Name of a preset to run (class name from the 'presets' directory.)",
default=None,
type=str)
parser.add_argument('-l', '--list',
help="(flag) List all available presets",
action='store_true')
parser.add_argument('-e', '--experiment_name',
help="(string) Experiment name to be used to store the results.",
default='',
type=str)
parser.add_argument('-r', '--render',
help="(flag) Render environment",
action='store_true')
parser.add_argument('-f', '--framework',
help="(string) Neural network framework. Available values: tensorflow",
default='tensorflow',
type=str)
parser.add_argument('-n', '--num_workers',
help="(int) Number of workers for multi-process based agents, e.g. A3C",
default=1,
type=int)
parser.add_argument('-c', '--use_cpu',
help="(flag) Use only the cpu for training. If a GPU is not available, this flag will have no "
"effect and the CPU will be used either way.",
action='store_true')
parser.add_argument('-ew', '--evaluation_worker',
help="(int) If multiple workers are used, add an evaluation worker as well which will "
"evaluate asynchronously and independently during the training. NOTE: this worker will "
"ignore the evaluation settings in the preset's ScheduleParams.",
action='store_true')
parser.add_argument('--play',
help="(flag) Play as a human by controlling the game with the keyboard. "
"This option will save a replay buffer with the game play.",
action='store_true')
parser.add_argument('--evaluate',
help="(flag) Run evaluation only. This is a convenient way to disable "
"training in order to evaluate an existing checkpoint.",
action='store_true')
parser.add_argument('-v', '--verbosity',
help="(flag) Sets the verbosity level of Coach print outs. Can be either low or high.",
default="low",
type=str)
parser.add_argument('-tfv', '--tf_verbosity',
help="(flag) TensorFlow verbosity level",
default=3,
type=int)
parser.add_argument('-s', '--save_checkpoint_secs',
help="(int) Time in seconds between saving checkpoints of the model.",
default=None,
type=int)
parser.add_argument('-crd', '--checkpoint_restore_dir',
help='(string) Path to a folder containing a checkpoint to restore the model from.',
type=str)
parser.add_argument('-dg', '--dump_gifs',
help="(flag) Enable the gif saving functionality.",
action='store_true')
parser.add_argument('-dm', '--dump_mp4',
help="(flag) Enable the mp4 saving functionality.",
action='store_true')
parser.add_argument('-at', '--agent_type',
help="(string) Choose an agent type class to override on top of the selected preset. "
"If no preset is defined, a preset can be set from the command-line by combining settings "
"which are set by using --agent_type, --experiment_type, --environemnt_type",
default=None,
type=str)
parser.add_argument('-et', '--environment_type',
help="(string) Choose an environment type class to override on top of the selected preset."
"If no preset is defined, a preset can be set from the command-line by combining settings "
"which are set by using --agent_type, --experiment_type, --environemnt_type",
default=None,
type=str)
parser.add_argument('-ept', '--exploration_policy_type',
help="(string) Choose an exploration policy type class to override on top of the selected "
"preset."
"If no preset is defined, a preset can be set from the command-line by combining settings "
"which are set by using --agent_type, --experiment_type, --environemnt_type"
,
default=None,
type=str)
parser.add_argument('-lvl', '--level',
help="(string) Choose the level that will be played in the environment that was selected."
"This value will override the level parameter in the environment class."
,
default=None,
type=str)
parser.add_argument('-cp', '--custom_parameter',
help="(string) Semicolon separated parameters used to override specific parameters on top of"
" the selected preset (or on top of the command-line assembled one). "
"Whenever a parameter value is a string, it should be inputted as '\\\"string\\\"'. "
"For ex.: "
"\"visualization.render=False; num_training_iterations=500; optimizer='rmsprop'\"",
default=None,
type=str)
parser.add_argument('--print_parameters',
help="(flag) Print tuning_parameters to stdout",
action='store_true')
parser.add_argument('-tb', '--tensorboard',
help="(flag) When using the TensorFlow backend, enable TensorBoard log dumps. ",
action='store_true')
parser.add_argument('-ns', '--no_summary',
help="(flag) Prevent Coach from printing a summary and asking questions at the end of runs",
action='store_true')
parser.add_argument('-d', '--open_dashboard',
help="(flag) Open dashboard with the experiment when the run starts",
action='store_true')
parser.add_argument('--seed',
help="(int) A seed to use for running the experiment",
default=None,
type=int)
args = parse_arguments(parser)
graph_manager = get_graph_manager_from_args(args)
# Intel optimized TF seems to run significantly faster when limiting to a single OMP thread.
# This will not affect GPU runs.
os.environ["OMP_NUM_THREADS"] = "1"
# turn TF debug prints off
if args.framework == Frameworks.tensorflow:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_verbosity)
# turn off the summary at the end of the run if necessary
if not args.no_summary:
atexit.register(logger.summarize_experiment)
screen.change_terminal_title(args.experiment_name)
# open dashboard
if args.open_dashboard:
open_dashboard(args.experiment_path)
# Single-threaded runs
if args.num_workers == 1:
# Start the training or evaluation
task_parameters = TaskParameters(framework_type="tensorflow", # TODO: tensorflow should'nt be hardcoded
evaluate_only=args.evaluate,
experiment_path=args.experiment_path,
seed=args.seed,
use_cpu=args.use_cpu)
task_parameters.__dict__ = add_items_to_dict(task_parameters.__dict__, args.__dict__)
start_graph(graph_manager=graph_manager, task_parameters=task_parameters)
# Multi-threaded runs
else:
total_tasks = args.num_workers
if args.evaluation_worker:
total_tasks += 1
ps_hosts = "localhost:{}".format(get_open_port())
worker_hosts = ",".join(["localhost:{}".format(get_open_port()) for i in range(total_tasks)])
# Shared memory
class CommManager(BaseManager):
pass
CommManager.register('SharedMemoryScratchPad', SharedMemoryScratchPad, exposed=['add', 'get', 'internal_call'])
comm_manager = CommManager()
comm_manager.start()
shared_memory_scratchpad = comm_manager.SharedMemoryScratchPad()
def start_distributed_task(job_type, task_index, evaluation_worker=False,
shared_memory_scratchpad=shared_memory_scratchpad):
task_parameters = DistributedTaskParameters(framework_type="tensorflow", # TODO: tensorflow should'nt be hardcoded
parameters_server_hosts=ps_hosts,
worker_hosts=worker_hosts,
job_type=job_type,
task_index=task_index,
evaluate_only=evaluation_worker,
use_cpu=args.use_cpu,
num_tasks=total_tasks, # training tasks + 1 evaluation task
num_training_tasks=args.num_workers,
experiment_path=args.experiment_path,
shared_memory_scratchpad=shared_memory_scratchpad,
seed=args.seed+task_index if args.seed is not None else None) # each worker gets a different seed
task_parameters.__dict__ = add_items_to_dict(task_parameters.__dict__, args.__dict__)
# we assume that only the evaluation workers are rendering
graph_manager.visualization_parameters.render = args.render and evaluation_worker
p = Process(target=start_graph, args=(graph_manager, task_parameters))
# p.daemon = True
p.start()
return p
# parameter server
parameter_server = start_distributed_task("ps", 0)
# training workers
# wait a bit before spawning the non chief workers in order to make sure the session is already created
workers = []
workers.append(start_distributed_task("worker", 0))
time.sleep(2)
for task_index in range(1, args.num_workers):
workers.append(start_distributed_task("worker", task_index))
# evaluation worker
if args.evaluation_worker:
evaluation_worker = start_distributed_task("worker", args.num_workers, evaluation_worker=True)
# wait for all workers
[w.join() for w in workers]
if args.evaluation_worker:
evaluation_worker.terminate()
if __name__ == "__main__":
main()
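# Illustrative command lines (preset names depend on the presets shipped with this release):
#   python3 rl_coach/coach.py -p CartPole_DQN -r
#   python3 rl_coach/coach.py -p Atari_DQN -lvl breakout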

687
rl_coach/core_types.py Normal file
View File

@@ -0,0 +1,687 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from enum import Enum
from typing import List, Union, Dict, Any, Type
from random import shuffle
import numpy as np
import copy
ActionType = Union[int, float, np.ndarray, List]
GoalType = Union[None, np.ndarray]
ObservationType = np.ndarray
RewardType = Union[int, float, np.ndarray]
StateType = Dict[str, np.ndarray]
class GoalTypes(Enum):
Embedding = 1
EmbeddingChange = 2
Observation = 3
Measurements = 4
# step methods
class StepMethod(object):
def __init__(self, num_steps: int):
self._num_steps = self.num_steps = num_steps
@property
def num_steps(self) -> int:
return self._num_steps
@num_steps.setter
def num_steps(self, val: int) -> None:
self._num_steps = val
class Frames(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class EnvironmentSteps(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class EnvironmentEpisodes(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class TrainingSteps(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class Time(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class PredictionType(object):
pass
class VStateValue(PredictionType):
pass
class QActionStateValue(PredictionType):
pass
class ActionProbabilities(PredictionType):
pass
class Embedding(PredictionType):
pass
class InputEmbedding(Embedding):
pass
class MiddlewareEmbedding(Embedding):
pass
class InputImageEmbedding(InputEmbedding):
pass
class InputVectorEmbedding(InputEmbedding):
pass
class Middleware_FC_Embedding(MiddlewareEmbedding):
pass
class Middleware_LSTM_Embedding(MiddlewareEmbedding):
pass
class Measurements(PredictionType):
pass
PlayingStepsType = Union[EnvironmentSteps, EnvironmentEpisodes, Frames]
# run phases
class RunPhase(Enum):
HEATUP = "Heatup"
TRAIN = "Training"
TEST = "Testing"
UNDEFINED = "Undefined"
# transitions
class Transition(object):
def __init__(self, state: Dict[str, np.ndarray]=None, action: ActionType=None, reward: RewardType=None,
next_state: Dict[str, np.ndarray]=None, game_over: bool=None, info: Dict=None):
"""
A transition is a tuple containing the information of a single step of interaction
between the agent and the environment. The most basic version should contain the following values:
(current state, action, reward, next state, game over)
For imitation learning algorithms, if the reward, next state or game over is not known,
it is sufficient to store the current state and action taken by the expert.
:param state: The current state. Assumed to be a dictionary where the observation
is located at state['observation']
:param action: The current action that was taken
:param reward: The reward received from the environment
:param next_state: The next state of the environment after applying the action.
The next state should be similar to the state in its structure.
:param game_over: A boolean which should be True if the episode terminated after
the execution of the action.
:param info: A dictionary containing any additional information to be stored in the transition
"""
self._state = self.state = state
self._action = self.action = action
self._reward = self.reward = reward
self._total_return = self.total_return = None
if not next_state:
next_state = state
self._next_state = self.next_state = next_state
self._game_over = self.game_over = game_over
if info is None:
self.info = {}
else:
self.info = info
def __repr__(self):
return str(self.__dict__)
@property
def state(self):
if self._state is None:
raise Exception("The state was not filled by any of the modules between the environment and the agent")
return self._state
@state.setter
def state(self, val):
self._state = val
@property
def action(self):
if self._action is None:
raise Exception("The action was not filled by any of the modules between the environment and the agent")
return self._action
@action.setter
def action(self, val):
self._action = val
@property
def reward(self):
if self._reward is None:
raise Exception("The reward was not filled by any of the modules between the environment and the agent")
return self._reward
@reward.setter
def reward(self, val):
self._reward = val
@property
def total_return(self):
if self._total_return is None:
raise Exception("The total_return was not filled by any of the modules between the environment and the "
"agent. Make sure that you are using an episodic experience replay.")
return self._total_return
@total_return.setter
def total_return(self, val):
self._total_return = val
@property
def game_over(self):
if self._game_over is None:
raise Exception("The done flag was not filled by any of the modules between the environment and the agent")
return self._game_over
@game_over.setter
def game_over(self, val):
self._game_over = val
@property
def next_state(self):
if self._next_state is None:
raise Exception("The next state was not filled by any of the modules between the environment and the agent")
return self._next_state
@next_state.setter
def next_state(self, val):
self._next_state = val
def add_info(self, new_info: Dict[str, Any]) -> None:
if not new_info.keys().isdisjoint(self.info.keys()):
raise ValueError("The new info dictionary can not be appended to the existing info dictionary since there "
"are overlapping keys between the two. old keys: {}, new keys: {}"
.format(self.info.keys(), new_info.keys()))
self.info.update(new_info)
def __copy__(self):
new_transition = type(self)()
new_transition.__dict__.update(self.__dict__)
new_transition.state = copy.copy(new_transition.state)
new_transition.next_state = copy.copy(new_transition.next_state)
new_transition.info = copy.copy(new_transition.info)
return new_transition
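# A hypothetical construction sketch (not part of the original file):
#   t = Transition(state={'observation': np.zeros(4)}, action=1, reward=0.5,
#                  next_state={'observation': np.ones(4)}, game_over=False)
#   t.add_info({'action_probability': 0.25})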
class EnvResponse(object):
def __init__(self, next_state: Dict[str, ObservationType], reward: RewardType, game_over: bool, info: Dict=None,
goal: ObservationType=None):
"""
An env response is a collection containing the information returned from the environment after a single action
has been performed on it.
:param next_state: The new state that the environment has transitioned into. Assumed to be a dictionary where the
observation is located at state['observation']
:param reward: The reward received from the environment
:param game_over: A boolean which should be True if the episode terminated after
the execution of the action.
:param info: any additional info from the environment
:param goal: a goal defined by the environment
"""
self._next_state = self.next_state = next_state
self._reward = self.reward = reward
self._game_over = self.game_over = game_over
self._goal = self.goal = goal
if info is None:
self.info = {}
else:
self.info = info
def __repr__(self):
return str(self.__dict__)
@property
def next_state(self):
return self._next_state
@next_state.setter
def next_state(self, val):
self._next_state = val
@property
def reward(self):
return self._reward
@reward.setter
def reward(self, val):
self._reward = val
@property
def game_over(self):
return self._game_over
@game_over.setter
def game_over(self, val):
self._game_over = val
@property
def goal(self):
return self._goal
@goal.setter
def goal(self, val):
self._goal = val
def add_info(self, info: Dict[str, Any]) -> None:
if not info.keys().isdisjoint(self.info.keys()):
raise ValueError("The new info dictionary can not be appended to the existing info dictionary since there "
"are overlapping keys between the two")
self.info.update(info)
class ActionInfo(object):
"""
Action info is a class that holds an action and various additional information details about it
"""
def __init__(self, action: ActionType, action_probability: float=0,
action_value: float=0., state_value: float=0., max_action_value: float=None,
action_intrinsic_reward: float=0):
"""
:param action: the action
:param action_probability: the probability that the action was given when selecting it
:param action_value: the state-action value (Q value) of the action
:param state_value: the state value (V value) of the state where the action was taken
:param max_action_value: in case this is an action that was selected randomly, this is the value of the action
that received the maximum value. if no value is given, the action is assumed to be the
action with the maximum value
:param action_intrinsic_reward: can contain any intrinsic reward that the agent wants to add to this action
selection
"""
self.action = action
self.action_probability = action_probability
self.action_value = action_value
self.state_value = state_value
if not max_action_value:
self.max_action_value = action_value
else:
self.max_action_value = max_action_value
self.action_intrinsic_reward = action_intrinsic_reward
class Batch(object):
def __init__(self, transitions: List[Transition]):
"""
A wrapper around a list of transitions that helps extract batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.
:param transitions: a list of transitions to extract the batch from
"""
self.transitions = transitions
self._states = {}
self._actions = None
self._rewards = None
self._total_returns = None
self._game_overs = None
self._next_states = {}
self._goals = None
self._info = {}
def slice(self, start, end) -> None:
"""
Keep a slice from the batch and discard the rest of the batch
:param start: the start index in the slice
:param end: the end index in the slice
:return: None
"""
self.transitions = self.transitions[start:end]
for k, v in self._states.items():
self._states[k] = v[start:end]
if self._actions is not None:
self._actions = self._actions[start:end]
if self._rewards is not None:
self._rewards = self._rewards[start:end]
if self._total_returns is not None:
self._total_returns = self._total_returns[start:end]
if self._game_overs is not None:
self._game_overs = self._game_overs[start:end]
for k, v in self._next_states.items():
self._next_states[k] = v[start:end]
if self._goals is not None:
self._goals = self._goals[start:end]
for k, v in self._info.items():
self._info[k] = v[start:end]
def shuffle(self) -> None:
"""
Shuffle all the transitions in the batch
:return: None
"""
batch_order = list(range(self.size))
shuffle(batch_order)
self.transitions = [self.transitions[i] for i in batch_order]
self._states = {}
self._actions = None
self._rewards = None
self._total_returns = None
self._game_overs = None
self._next_states = {}
self._goals = None
self._info = {}
# This seems to be slower
# for k, v in self._states.items():
# self._states[k] = [v[i] for i in batch_order]
# if self._actions is not None:
# self._actions = [self._actions[i] for i in batch_order]
# if self._rewards is not None:
# self._rewards = [self._rewards[i] for i in batch_order]
# if self._total_returns is not None:
# self._total_returns = [self._total_returns[i] for i in batch_order]
# if self._game_overs is not None:
# self._game_overs = [self._game_overs[i] for i in batch_order]
# for k, v in self._next_states.items():
# self._next_states[k] = [v[i] for i in batch_order]
# if self._goals is not None:
# self._goals = [self._goals[i] for i in batch_order]
# for k, v in self._info.items():
# self._info[k] = [v[i] for i in batch_order]
def states(self, fetches: List[str], expand_dims=False) -> Dict[str, np.ndarray]:
"""
follow the keys in fetches to extract the corresponding items from the states in the batch
if these keys were not already extracted before. return only the values corresponding to those keys
:param fetches: the keys of the state dictionary to extract
:param expand_dims: add an extra dimension to each of the value batches
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
"""
current_states = {}
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
# addition to the current_state, so that all the inputs of the network will be filled)
for key in set(fetches).intersection(self.transitions[0].state.keys()):
if key not in self._states.keys():
self._states[key] = np.array([np.array(transition.state[key]) for transition in self.transitions])
if expand_dims:
current_states[key] = np.expand_dims(self._states[key], -1)
else:
current_states[key] = self._states[key]
return current_states
def actions(self, expand_dims=False) -> np.ndarray:
"""
if the actions were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the actions batch
:return: a numpy array containing all the actions of the batch
"""
if self._actions is None:
self._actions = np.array([transition.action for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._actions, -1)
return self._actions
def rewards(self, expand_dims=False) -> np.ndarray:
"""
if the rewards were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the rewards batch
:return: a numpy array containing all the rewards of the batch
"""
if self._rewards is None:
self._rewards = np.array([transition.reward for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._rewards, -1)
return self._rewards
def total_returns(self, expand_dims=False) -> np.ndarray:
"""
if the total_returns were not converted to a batch before, extract them to a batch and then return the batch
if the total return was not filled, this will raise an exception
:param expand_dims: add an extra dimension to the total_returns batch
:return: a numpy array containing all the total return values of the batch
"""
if self._total_returns is None:
self._total_returns = np.array([transition.total_return for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._total_returns, -1)
return self._total_returns
def game_overs(self, expand_dims=False) -> np.ndarray:
"""
if the game_overs were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the game_overs batch
:return: a numpy array containing all the game over flags of the batch
"""
if self._game_overs is None:
self._game_overs = np.array([transition.game_over for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._game_overs, -1)
return self._game_overs
def next_states(self, fetches: List[str], expand_dims=False) -> Dict[str, np.ndarray]:
"""
follow the keys in fetches to extract the corresponding items from the next states in the batch
if these keys were not already extracted before. return only the values corresponding to those keys
:param fetches: the keys of the state dictionary to extract
:param expand_dims: add an extra dimension to each of the value batches
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
"""
next_states = {}
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
# addition to the current_state, so that all the inputs of the network will be filled)
for key in set(fetches).intersection(self.transitions[0].next_state.keys()):
if key not in self._next_states.keys():
self._next_states[key] = np.array([np.array(transition.next_state[key]) for transition in self.transitions])
if expand_dims:
next_states[key] = np.expand_dims(self._next_states[key], -1)
else:
next_states[key] = self._next_states[key]
return next_states
def goals(self, expand_dims=False) -> np.ndarray:
"""
if the goals were not converted to a batch before, extract them to a batch and then return the batch
if the goal was not filled, this will raise an exception
:param expand_dims: add an extra dimension to the goals batch
:return: a numpy array containing all the goals of the batch
"""
if self._goals is None:
self._goals = np.array([transition.goal for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._goals, -1)
return self._goals
def info(self, key, expand_dims=False) -> np.ndarray:
"""
if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the
batch. if the key is not part of the keys in the info dictionary, this will raise an exception
:param expand_dims: add an extra dimension to the info batch
:return: a numpy array containing all the info values of the batch corresponding to the given key
"""
if key not in self._info.keys():
self._info[key] = np.array([transition.info[key] for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._info[key], -1)
return self._info[key]
@property
def size(self) -> int:
"""
:return: the size of the batch
"""
return len(self.transitions)
def __getitem__(self, key):
"""
get an item from the transitions list
:param key: index of the transition in the batch
:return: the transition corresponding to the given index
"""
return self.transitions[key]
def __setitem__(self, key, item):
"""
set an item in the transition list
:param key: index of the transition in the batch
:param item: the transition to place in the given index
:return: None
"""
self.transitions[key] = item
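# A hypothetical usage sketch (not part of the original file): wrapping transitions and pulling
# out lazily-built numpy batches of their fields.
#   batch = Batch([Transition(state={'observation': np.zeros(4)}, action=0, reward=1.0,
#                             game_over=False) for _ in range(32)])
#   observations = batch.states(['observation'])['observation']   # shape (32, 4)
#   rewards = batch.rewards(expand_dims=True)                      # shape (32, 1)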
class TotalStepsCounter(object):
"""
A wrapper around a dictionary counting the number of steps done for each StepMethod type.
"""
def __init__(self):
self.counters = {
EnvironmentEpisodes: 0,
EnvironmentSteps: 0,
TrainingSteps: 0
}
def __getitem__(self, key: Type[StepMethod]) -> int:
"""
get counter value
:param key: counter type
:return: the counter value
"""
return self.counters[key]
def __setitem__(self, key: StepMethod, item: int) -> None:
"""
set an item in the transition list
:param key: counter type
:param item: an integer representing the new counter value
:return: None
"""
self.counters[key] = item
class GradientClippingMethod(Enum):
ClipByGlobalNorm = 0
ClipByNorm = 1
ClipByValue = 2
class Episode(object):
def __init__(self, discount: float=0.99, bootstrap_total_return_from_old_policy: bool=False, n_step: int=-1):
"""
:param discount: the discount factor to use when calculating total returns
:param bootstrap_total_return_from_old_policy: should the total return be bootstrapped from the values in the
memory
:param n_step: the number of future steps to sum the reward over before bootstrapping
"""
self.transitions = []
# a num_transitions x num_transitions table with the n step return in the n'th row
self.returns_table = None
self._length = 0
self.discount = discount
self.bootstrap_total_return_from_old_policy = bootstrap_total_return_from_old_policy
self.n_step = n_step
self.is_complete = False
def insert(self, transition):
self.transitions.append(transition)
self._length += 1
def is_empty(self):
return self.length() == 0
def length(self):
return self._length
def get_transition(self, transition_idx):
return self.transitions[transition_idx]
def get_last_transition(self):
return self.get_transition(-1) if self.length() > 0 else None
def get_first_transition(self):
return self.get_transition(0) if self.length() > 0 else None
def update_returns(self):
if self.n_step == -1 or self.n_step > self.length():
self.n_step = self.length()
rewards = np.array([t.reward for t in self.transitions])
rewards = rewards.astype('float')
total_return = rewards.copy()
current_discount = self.discount
for i in range(1, self.n_step):
total_return += current_discount * np.pad(rewards[i:], (0, i), 'constant', constant_values=0)
current_discount *= self.discount
# calculate the bootstrapped returns
if self.bootstrap_total_return_from_old_policy:
bootstraps = np.array([np.squeeze(t.info['max_action_value']) for t in self.transitions[self.n_step:]])
bootstrapped_return = total_return + current_discount * np.pad(bootstraps, (0, self.n_step), 'constant',
constant_values=0)
total_return = bootstrapped_return
for transition_idx in range(self.length()):
self.transitions[transition_idx].total_return = total_return[transition_idx]
def update_actions_probabilities(self):
probability_product = 1
for transition_idx, transition in enumerate(self.transitions):
if 'action_probabilities' in transition.info.keys():
probability_product *= transition.info['action_probabilities']
for transition_idx, transition in enumerate(self.transitions):
transition.info['probability_product'] = probability_product
def get_returns_table(self):
return self.returns_table
def get_returns(self):
return self.get_transitions_attribute('total_return')
def get_transitions_attribute(self, attribute_name):
if len(self.transitions) > 0 and hasattr(self.transitions[0], attribute_name):
return [getattr(t, attribute_name) for t in self.transitions]
elif len(self.transitions) == 0:
return []
else:
raise ValueError("The transitions have no such attribute name")
def to_batch(self):
batch = []
for i in range(self.length()):
batch.append(self.get_transition(i))
return batch
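# A minimal NumPy sketch (not part of the original file) of the n-step discounted return computed by
# Episode.update_returns: R_t = r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1}, padding with
# zeros past the end of the episode.
def _n_step_returns_sketch(rewards, discount=0.99, n_step=3):
    rewards = np.asarray(rewards, dtype=float)
    total_return = rewards.copy()
    current_discount = discount
    for i in range(1, n_step):
        # shift the rewards i steps forward and pad the tail with zeros
        total_return += current_discount * np.pad(rewards[i:], (0, i), 'constant', constant_values=0)
        current_discount *= discount
    return total_return

# _n_step_returns_sketch([1, 1, 1, 1], discount=0.5, n_step=2) -> array([1.5, 1.5, 1.5, 1.0])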

73
rl_coach/dashboard.py Normal file
View File

@@ -0,0 +1,73 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
To run Coach Dashboard, run the following command:
python3 dashboard.py
"""
import sys
sys.path.append('.')
import os
from rl_coach.dashboard_components.experiment_board import display_directory_group, display_files
from rl_coach.dashboard_components.globals import doc
import rl_coach.dashboard_components.boards
from rl_coach.dashboard_components.landing_page import landing_page
doc.add_root(landing_page)
import argparse
import glob
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--experiment_dir',
help="(string) The path of an experiment dir to open",
default=None,
type=str)
parser.add_argument('-f', '--experiment_files',
help="(string) The path of an experiment file to open",
default=None,
type=str)
args = parser.parse_args()
if args.experiment_dir:
doc.add_timeout_callback(lambda: display_directory_group(args.experiment_dir), 1000)
elif args.experiment_files:
# expand the glob pattern into the list of matching experiment files
files = glob.glob(args.experiment_files)
doc.add_timeout_callback(lambda: display_files(files), 1000)
def main():
from rl_coach.utils import get_open_port
dashboard_path = os.path.realpath(__file__)
command = 'bokeh serve --show {} --port {}'.format(dashboard_path, get_open_port())
if args.experiment_dir or args.experiment_files:
command += ' --args'
if args.experiment_dir:
command += ' --experiment_dir {}'.format(args.experiment_dir)
if args.experiment_files:
command += ' --experiment_files {}'.format(args.experiment_files)
os.system(command)
if __name__ == "__main__":
main()
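# Illustrative invocation (the experiment path below is a placeholder):
#   python3 rl_coach/dashboard.py --experiment_dir ./experiments/my_run
# main() then launches `bokeh serve --show <this file> --port <open port> --args --experiment_dir ./experiments/my_run`.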

View File

@@ -0,0 +1,21 @@
from bokeh.layouts import column
from bokeh.models.widgets import Panel, Tabs
from rl_coach.dashboard_components.experiment_board import experiment_board_layout
from rl_coach.dashboard_components.episodic_board import episodic_board_layout
from rl_coach.dashboard_components.globals import spinner, layouts
from bokeh.models.widgets import Div
# ---------------- Build Website Layout -------------------
# title
title = Div(text="""<h1>Coach Dashboard</h1>""")
center = Div(text="""<style>html { padding-left: 50px; } </style>""")
tab1 = Panel(child=experiment_board_layout, title='experiment board')
# tab2 = Panel(child=episodic_board_layout, title='episodic board')
# tabs = Tabs(tabs=[tab1, tab2])
tabs = Tabs(tabs=[tab1])
layout = column(title, center, tabs)
layout = column(layout, spinner)
layouts['boards'] = layout

View File

@@ -0,0 +1,99 @@
from bokeh.layouts import row, column, widgetbox, Spacer
from bokeh.models import ColumnDataSource, Range1d, LinearAxis, Legend
from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup, Toggle
from bokeh.plotting import figure
from rl_coach.dashboard_components.globals import layouts, crcolor, crx, cry, color_resolution, crRGBs
from rl_coach.dashboard_components.experiment_board import file_selection_button, files_selector_spacer, \
group_selection_button, unload_file_button, files_selector
# ---------------- Build Website Layout -------------------
# file refresh time placeholder
refresh_info = Div(text="""""", width=210)
# create figures
plot = figure(plot_width=1200, plot_height=800,
tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save',
toolbar_location='above', x_axis_label='Episodes',
x_range=Range1d(0, 10000), y_range=Range1d(0, 100000))
plot.extra_y_ranges = {"secondary": Range1d(start=-100, end=200)}
plot.add_layout(LinearAxis(y_range_name="secondary"), 'right')
plot.yaxis[-1].visible = False
# legend
div = Div(text="""""")
legend = widgetbox([div])
bokeh_legend = Legend(
# items=[("12345678901234567890123456789012345678901234567890", [])], # 50 letters
items=[("__________________________________________________", [])], # 50 letters
location=(0, 0), orientation="vertical",
border_line_color="black",
label_text_font_size={'value': '9pt'},
margin=30
)
plot.add_layout(bokeh_legend, "right")
# select file
file_selection_button = Button(label="Select Files", button_type="success", width=120)
# file_selection_button.on_click(load_files_group)
files_selector_spacer = Spacer(width=10)
group_selection_button = Button(label="Select Directory", button_type="primary", width=140)
# group_selection_button.on_click(load_directory_group)
unload_file_button = Button(label="Unload", button_type="danger", width=50)
# unload_file_button.on_click(unload_file)
# files selection box
files_selector = Select(title="Files:", options=[])
# files_selector.on_change('value', change_data_selector)
# data selection box
data_selector = MultiSelect(title="Data:", options=[], size=12)
# data_selector.on_change('value', select_data)
# toggle second axis button
toggle_second_axis_button = Button(label="Toggle Second Axis", button_type="success")
# toggle_second_axis_button.on_click(toggle_second_axis)
# averaging slider
averaging_slider = Slider(title="Averaging window", start=1, end=101, step=10)
# averaging_slider.on_change('value', update_averaging)
# color selector
color_selector_title = Div(text="""Select Color:""")
crsource = ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs))
color_selector = figure(x_range=(0, color_resolution), y_range=(0, 10),
plot_width=300, plot_height=40,
tools='tap')
color_selector.axis.visible = False
color_range = color_selector.rect(x='x', y='y', width=1, height=10,
color='crcolor', source=crsource)
# crsource.on_change('selected', select_color)
color_range.nonselection_glyph = color_range.glyph
color_selector.toolbar.logo = None
color_selector.toolbar_location = None
episode_selector = MultiSelect(title="Episode:", options=['0', '1', '2', '3', '4'], size=1)
online_toggle = Toggle(label="Online", button_type="success")
# main layout of the document
layout = row(file_selection_button, files_selector_spacer, group_selection_button, width=300)
layout = column(layout, files_selector)
layout = column(layout, row(refresh_info, unload_file_button))
layout = column(layout, data_selector)
layout = column(layout, color_selector_title)
layout = column(layout, color_selector)
layout = column(layout, toggle_second_axis_button)
layout = column(layout, averaging_slider)
layout = column(layout, episode_selector)
layout = column(layout, online_toggle)
layout = row(layout, plot)
episodic_board_layout = layout
layouts["episodic_board"] = episodic_board_layout

View File

@@ -0,0 +1,564 @@
import copy
import datetime
import os
import sys
import time
from itertools import cycle
from os import listdir
from os.path import isfile, join, isdir
from bokeh.layouts import row, column, Spacer, ToolbarBox
from bokeh.models import ColumnDataSource, Range1d, LinearAxis, Legend, \
WheelZoomTool, CrosshairTool, ResetTool, SaveTool, Toolbar, PanTool, BoxZoomTool, \
Toggle
from bokeh.models.callbacks import CustomJS
from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup
from bokeh.plotting import figure
from rl_coach.dashboard_components.globals import signals_files, x_axis_labels, x_axis_options, show_spinner, hide_spinner, \
dialog, FolderType, RunType, add_directory_csv_files, doc, display_boards, layouts, \
crcolor, crx, cry, color_resolution, crRGBs, rgb_to_hex, x_axis
from rl_coach.dashboard_components.signals_files_group import SignalsFilesGroup
from rl_coach.dashboard_components.signals_file import SignalsFile
def update_axis_range(name, range_placeholder):
max_val = -float('inf')
min_val = float('inf')
selected_signal = None
if name in x_axis_options:
selected_signal = name
for signals_file in signals_files.values():
curr_min_val, curr_max_val = signals_file.get_range_of_selected_signals_on_axis(name, selected_signal)
max_val = max(max_val, curr_max_val)
min_val = min(min_val, curr_min_val)
if min_val != float('inf'):
if min_val == max_val:
range = 5
else:
range = max_val - min_val
range_placeholder.start = min_val - 0.1 * range
range_placeholder.end = max_val + 0.1 * range
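# e.g. selected signals spanning [0, 50] give range = 50, so the axis is set to [-5, 55]
# (10% padding on each side); a flat signal (min == max) falls back to range = 5, i.e. ±0.5 of padding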
# update axes ranges
def update_y_axis_ranges():
update_axis_range('default', plot.y_range)
update_axis_range('secondary', plot.extra_y_ranges['secondary'])
def update_x_axis_ranges():
update_axis_range(x_axis[0], plot.x_range)
def get_all_selected_signals():
signals = []
for signals_file in signals_files.values():
signals += signals_file.get_selected_signals()
return signals
# update legend using the legend text dictionary
def update_legend():
selected_signals = get_all_selected_signals()
max_line_length = 50
items = []
for signal in selected_signals:
side_sign = "" if signal.axis == 'default' else ""
signal_name = side_sign + " " + signal.full_name
# bokeh legend does not respect a max_width parameter so we split the text manually to lines of constant width
signal_name = [signal_name[n:n + max_line_length] for n in range(0, len(signal_name), max_line_length)]
for idx, substr in enumerate(signal_name):
if idx == 0:
lines = [signal.line]
if signal.show_bollinger_bands:
lines.append(signal.bands)
items.append((substr, lines))
else:
items.append((substr, []))
if bokeh_legend.items == [] or items == [] or \
any([legend_item.renderers != item[1] for legend_item, item in zip(bokeh_legend.items, items)])\
or any([legend_item.label != item[0] for legend_item, item in zip(bokeh_legend.items, items)]):
bokeh_legend.items = items # this step takes a long time because it is redrawing the plot
# the visible=false => visible=true is a hack to make the legend render again
bokeh_legend.visible = False
bokeh_legend.visible = True
# select lines to display
def select_data(args, old, new):
if selected_file is None:
return
show_spinner("Updating the signal selection...")
selected_signals = new
for signal_name in selected_file.signals.keys():
is_selected = signal_name in selected_signals
selected_file.set_signal_selection(signal_name, is_selected)
# update axes ranges
update_y_axis_ranges()
update_x_axis_ranges()
# update the legend
update_legend()
hide_spinner()
# add new lines to the plot
def plot_signals(signals_file, signals):
for idx, signal in enumerate(signals):
signal.line = plot.line('index', signal.name, source=signals_file.bokeh_source,
line_color=signal.color, line_width=2)
def open_file_dialog():
return dialog.getFileDialog()
def open_directory_dialog():
return dialog.getDirDialog()
# will create a group from the files
def create_files_group_signal(files):
global selected_file
signals_file = SignalsFilesGroup(files, plot)
signals_files[signals_file.filename] = signals_file
filenames = [signals_file.filename]
if files_selector.options[0] == "":
files_selector.options = filenames
else:
files_selector.options = files_selector.options + filenames
files_selector.value = filenames[0]
selected_file = signals_file
# load files from disk as a group
def load_files_group():
show_spinner("Loading files group...")
files = open_file_dialog()
# no files selected
if not files or not files[0]:
hide_spinner()
return
display_boards()
if len(files) == 1:
create_files_signal(files)
else:
create_files_group_signal(files)
change_selected_signals_in_data_selector([""])
hide_spinner()
# classify the folder as containing a single file, multiple files or only folders
def classify_folder(dir_path):
files = [f for f in listdir(dir_path) if isfile(join(dir_path, f)) and f.endswith('.csv')]
folders = [d for d in listdir(dir_path) if isdir(join(dir_path, d)) and any(f.endswith(".csv") for f in os.listdir(join(dir_path, d)))]
if len(files) == 1:
return FolderType.SINGLE_FILE
elif len(files) > 1:
return FolderType.MULTIPLE_FILES
elif len(folders) == 1:
return classify_folder(join(dir_path, folders[0]))
elif len(folders) > 1:
return FolderType.MULTIPLE_FOLDERS
else:
return FolderType.EMPTY
# finds if this is single-threaded or multi-threaded
def get_run_type(dir_path):
folder_type = classify_folder(dir_path)
if folder_type == FolderType.SINGLE_FILE:
folder_type = RunType.SINGLE_FOLDER_SINGLE_FILE
elif folder_type == FolderType.MULTIPLE_FILES:
folder_type = RunType.SINGLE_FOLDER_MULTIPLE_FILES
elif folder_type == FolderType.MULTIPLE_FOLDERS:
# folder contains sub dirs -> we assume we can classify the folder using only the first sub dir
sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
# checking only the first folder in the root dir for its type, since we assume that all sub dirs share the
# same structure (i.e. if one is the result of a multi-threaded run, so are all the others).
folder_type = classify_folder(os.path.join(dir_path, sub_dirs[0]))
if folder_type == FolderType.SINGLE_FILE:
folder_type = RunType.MULTIPLE_FOLDERS_SINGLE_FILES
elif folder_type == FolderType.MULTIPLE_FILES:
folder_type = RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES
return folder_type
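# Illustrative layouts (directory and file names are hypothetical):
#   exp/worker_0.csv                                        -> RunType.SINGLE_FOLDER_SINGLE_FILE
#   exp/worker_0.csv + exp/worker_1.csv                     -> RunType.SINGLE_FOLDER_MULTIPLE_FILES
#   exp/run_1/worker_0.csv + exp/run_2/worker_0.csv         -> RunType.MULTIPLE_FOLDERS_SINGLE_FILES
#   exp/run_1/worker_{0,1}.csv + exp/run_2/worker_{0,1}.csv -> RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES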
# create a signal file from the directory path according to the directory underlying structure
def handle_dir(dir_path, run_type):
paths = add_directory_csv_files(dir_path)
if run_type in [RunType.SINGLE_FOLDER_MULTIPLE_FILES,
RunType.MULTIPLE_FOLDERS_SINGLE_FILES]:
create_files_group_signal(paths)
elif run_type == RunType.SINGLE_FOLDER_SINGLE_FILE:
create_files_signal(paths, use_dir_name=True)
elif run_type == RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES:
sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
create_files_group_signal([os.path.join(dir_path, d) for d in sub_dirs])
# load directory from disk as a group
def load_directory_group():
show_spinner("Loading directories group...")
directory = open_directory_dialog()
# no files selected
if not directory:
hide_spinner()
return
display_directory_group(directory)
def display_directory_group(directory):
pause_auto_update()
display_boards()
show_spinner("Loading directories group...")
while get_run_type(directory) == FolderType.EMPTY:
show_spinner("Waiting for experiment directory to get populated...")
sys.stdout.write("Waiting for experiment directory to get populated...\r")
time.sleep(10)
handle_dir(directory, get_run_type(directory))
change_selected_signals_in_data_selector([""])
resume_auto_update_according_to_toggle()
hide_spinner()
def create_files_signal(files, use_dir_name=False):
global selected_file
new_signal_files = []
for idx, file_path in enumerate(files):
signals_file = SignalsFile(str(file_path), plot=plot, use_dir_name=use_dir_name)
signals_files[signals_file.filename] = signals_file
new_signal_files.append(signals_file)
filenames = [f.filename for f in new_signal_files]
if files_selector.options[0] == "":
files_selector.options = filenames
else:
files_selector.options = files_selector.options + filenames
files_selector.value = filenames[0]
selected_file = new_signal_files[0]
# load files from disk
def load_files():
show_spinner("Loading files...")
files = open_file_dialog()
# no files selected
if not files or not files[0]:
hide_spinner()
return
display_files(files)
def display_files(files):
pause_auto_update()
display_boards()
show_spinner("Loading files...")
create_files_signal(files)
change_selected_signals_in_data_selector([""])
resume_auto_update_according_to_toggle()
hide_spinner()
def unload_file():
global selected_file
if selected_file is None:
return
selected_file.hide_all_signals()
del signals_files[selected_file.filename]
data_selector.options = [""]
filenames_list = copy.copy(files_selector.options)
filenames_list.remove(selected_file.filename)
if len(filenames_list) == 0:
filenames_list = [""]
files_selector.options = filenames_list
filenames = cycle(filenames_list)
if files_selector.options[0] != "":
files_selector.value = next(filenames)
else:
files_selector.value = None
update_legend()
refresh_info.text = ""
if len(signals_files) == 0:
selected_file = None
# reload the selected csv file
def reload_all_files(force=False):
pause_auto_update()
for file_to_load in signals_files.values():
if force or file_to_load.file_was_modified_on_disk():
show_spinner("Updating files from the disk...")
file_to_load.load()
hide_spinner()
refresh_info.text = "Last Update: " + str(datetime.datetime.now()).split(".")[0]
resume_auto_update_according_to_toggle()
# unselect the currently selected signals and then select the requested signals in the data selector
def change_selected_signals_in_data_selector(selected_signals):
# the default bokeh way is not working due to a bug since Bokeh 0.12.6 (https://github.com/bokeh/bokeh/issues/6501)
# remove the data selection callback before updating the selector
data_selector.remove_on_change('value', select_data)
for value in list(data_selector.value):
if value in data_selector.options:
index = data_selector.options.index(value)
data_selector.options.remove(value)
data_selector.value.remove(value)
data_selector.options.insert(index, value)
data_selector.value = selected_signals
# add back the data selection callback
data_selector.on_change('value', select_data)
# change data options according to the selected file
def change_data_selector(args, old, new):
global selected_file
if new is None:
selected_file = None
return
show_spinner("Updating selection...")
selected_file = signals_files[new]
if isinstance(selected_file, SignalsFile):
group_cb.disabled = True
elif isinstance(selected_file, SignalsFilesGroup):
group_cb.disabled = False
data_selector.remove_on_change('value', select_data)
data_selector.options = sorted(list(selected_file.signals.keys()))
data_selector.on_change('value', select_data)
selected_signal_names = [s.name for s in selected_file.signals.values() if s.selected]
if not selected_signal_names:
selected_signal_names = [""]
change_selected_signals_in_data_selector(selected_signal_names)
averaging_slider.value = selected_file.signals_averaging_window
if len(averaging_slider_dummy_source.data['value']) > 0:
averaging_slider_dummy_source.data['value'][0] = selected_file.signals_averaging_window
group_cb.active = [0 if selected_file.show_bollinger_bands else None]
group_cb.active += [1 if selected_file.separate_files else None]
hide_spinner()
# smooth all the signals of the selected file
def update_averaging(args, old, new):
show_spinner("Smoothing the signals...")
# get the actual value from the dummy source
new = averaging_slider_dummy_source.data['value'][0]
selected_file.change_averaging_window(new)
hide_spinner()
def change_x_axis(val):
global x_axis
show_spinner("Updating the X axis...")
x_axis[0] = x_axis_options[val]
plot.xaxis.axis_label = x_axis_labels[val]
for file_to_load in signals_files.values():
file_to_load.update_x_axis_index()
# this is needed in order to recalculate the mean of all the files
if isinstance(file_to_load, SignalsFilesGroup):
file_to_load.load()
update_axis_range(x_axis[0], plot.x_range)
hide_spinner()
# move the signal between the main and secondary Y axes
def toggle_second_axis():
show_spinner("Switching the Y axis...")
plot.yaxis[-1].visible = True
selected_file.toggle_y_axis()
# this is just for redrawing the signals
selected_file.reload_data()
update_y_axis_ranges()
update_legend()
hide_spinner()
def toggle_group_property(new):
show_spinner("Loading...")
# toggle show / hide Bollinger bands
selected_file.change_bollinger_bands_state(0 in new)
# show a separate signal for each file in a group
selected_file.show_files_separately(1 in new)
update_legend()
hide_spinner()
# Color selection - most of these functions are taken from bokeh examples (plotting/color_sliders.py)
def select_color(attr, old, new):
show_spinner("Changing signal color...")
signals = selected_file.get_selected_signals()
for signal in signals:
signal.set_color(rgb_to_hex(crRGBs[new['1d']['indices'][0]]))
hide_spinner()
def pause_auto_update():
toggle_auto_update(False)
def resume_auto_update_according_to_toggle():
toggle_auto_update(auto_update_toggle_button.active)
def toggle_auto_update(new):
global file_update_callback
if new is False and file_update_callback in doc._session_callbacks:
doc.remove_periodic_callback(file_update_callback)
elif file_update_callback not in doc._session_callbacks:
file_update_callback = doc.add_periodic_callback(reload_all_files, 30000)
file_update_callback = doc.add_periodic_callback(reload_all_files, 30000)
# ---------------- Build Website Layout -------------------
# file refresh time placeholder
refresh_info = Div(text="""""", width=210)
# create figures
plot = figure(plot_width=1200, plot_height=800,
# tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save',
toolbar_location=None, x_axis_label='Episodes',
x_range=Range1d(0, 10000), y_range=Range1d(0, 100000), lod_factor=1000)
plot.extra_y_ranges = {"secondary": Range1d(start=-100, end=200)}
plot.add_layout(LinearAxis(y_range_name="secondary"), 'right')
toolbar = Toolbar(tools=[PanTool(), BoxZoomTool(), WheelZoomTool(), CrosshairTool(), ResetTool(), SaveTool()])
# plot.toolbar = toolbar
plot.add_tools(*toolbar.tools)
plot.yaxis[-1].visible = False
bokeh_legend = Legend(
items=[("", [])],
orientation="vertical",
border_line_color="black",
label_text_font_size={'value': '9pt'},
click_policy='hide',
visible=False
)
bokeh_legend.label_width = 100
plot.add_layout(bokeh_legend, "right")
plot.y_range = Range1d(0, 100)
plot.extra_y_ranges['secondary'] = Range1d(0, 100)
# select file
file_selection_button = Button(label="Select Files", button_type="success", width=120)
file_selection_button.on_click(load_files_group)
files_selector_spacer = Spacer(width=10)
group_selection_button = Button(label="Select Directory", button_type="primary", width=140)
group_selection_button.on_click(load_directory_group)
update_files_button = Button(label="Update Files", button_type="default", width=50)
update_files_button.on_click(reload_all_files)
auto_update_toggle_button = Toggle(label="Auto Update", button_type="default", width=50, active=True)
auto_update_toggle_button.on_click(toggle_auto_update)
unload_file_button = Button(label="Unload", button_type="danger", width=50)
unload_file_button.on_click(unload_file)
# files selection box
files_selector = Select(title="Files:", options=[""])
files_selector.on_change('value', change_data_selector)
# data selection box
data_selector = MultiSelect(title="Data:", options=[], size=12)
data_selector.on_change('value', select_data)
# x axis selection box
x_axis_selector_title = Div(text="""X Axis:""", height=10)
x_axis_selector = RadioButtonGroup(labels=x_axis_options, active=0)
x_axis_selector.on_click(change_x_axis)
# toggle second axis button
toggle_second_axis_button = Button(label="Toggle Second Axis", button_type="success")
toggle_second_axis_button.on_click(toggle_second_axis)
# averaging slider
# This data source is just used to communicate / trigger the real callback
averaging_slider_dummy_source = ColumnDataSource(data=dict(value=[]))
averaging_slider_dummy_source.on_change('data', update_averaging)
averaging_slider = Slider(title="Averaging window", start=1, end=101, step=10, callback_policy='mouseup')
averaging_slider.callback = CustomJS(args=dict(source=averaging_slider_dummy_source), code="""
source.data = { value: [cb_obj.value] }
""")
# group properties checkbox
group_cb = CheckboxGroup(labels=["Show statistics bands", "Ungroup signals"], active=[])
group_cb.on_click(toggle_group_property)
# color selector
color_selector_title = Div(text="""Select Color:""")
crsource = ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs))
color_selector = figure(x_range=(0, color_resolution), y_range=(0, 10),
plot_width=300, plot_height=40,
tools='tap')
color_selector.axis.visible = False
color_range = color_selector.rect(x='x', y='y', width=1, height=10,
color='crcolor', source=crsource)
crsource.on_change('selected', select_color)
color_range.nonselection_glyph = color_range.glyph
color_selector.toolbar.logo = None
color_selector.toolbar_location = None
# main layout of the document
layout = row(file_selection_button, files_selector_spacer, group_selection_button, width=300)
layout = column(layout, files_selector)
layout = column(layout, row(update_files_button, Spacer(width=50), auto_update_toggle_button,
Spacer(width=50), unload_file_button))
layout = column(layout, row(refresh_info))
layout = column(layout, data_selector)
layout = column(layout, color_selector_title)
layout = column(layout, color_selector)
layout = column(layout, x_axis_selector_title)
layout = column(layout, x_axis_selector)
layout = column(layout, group_cb)
layout = column(layout, toggle_second_axis_button)
layout = column(layout, averaging_slider)
toolbox = ToolbarBox(toolbar=toolbar, toolbar_location='above')
panel = column(toolbox, plot)
layout = row(layout, panel)
experiment_board_layout = layout
layouts["experiment_board"] = experiment_board_layout

View File

@@ -0,0 +1,136 @@
import os
from genericpath import isdir, isfile
from os import listdir
from os.path import join
from enum import Enum
from bokeh.models import Div
from bokeh.plotting import curdoc
import wx
import colorsys
patches = {}
signals_files = {}
selected_file = None
x_axis = ['Episode #']
x_axis_options = ['Episode #', 'Total steps', 'Wall-Clock Time']
x_axis_labels = ['Episode #', 'Total steps (per worker)', 'Wall-Clock Time (minutes)']
current_color = 0
# spinner
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(os.path.join(root_dir, 'dashboard_components/spinner.css'), 'r') as f:
spinner_style = """<style>{}</style>""".format(f.read())
spinner_html = """<ul class="spinner"><li></li><li></li><li></li><li></li>
<li>
<br>
<span style="font-size: 24px; font-weight: bold; margin-left: -175px; width: 400px;
position: absolute; text-align: center;">
{}
</span>
</li></ul>"""
spinner = Div(text="""""")
displayed_doc = "landing_page"
layouts = {}
def generate_color_range(N, I):
HSV_tuples = [(x*1.0/N, 0.5, I) for x in range(N)]
RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)
for_conversion = []
for RGB_tuple in RGB_tuples:
for_conversion.append((int(RGB_tuple[0]*255), int(RGB_tuple[1]*255), int(RGB_tuple[2]*255)))
hex_colors = [rgb_to_hex(RGB_tuple) for RGB_tuple in for_conversion]
return hex_colors, for_conversion
# convert RGB tuple to hexadecimal code
def rgb_to_hex(rgb):
return '#%02x%02x%02x' % rgb
# convert hexadecimal to RGB tuple
def hex_to_dec(hex):
red = ''.join(hex.strip('#')[0:2])
green = ''.join(hex.strip('#')[2:4])
blue = ''.join(hex.strip('#')[4:6])
return int(red, 16), int(green, 16), int(blue,16)
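# e.g. rgb_to_hex((255, 0, 60)) == '#ff003c' and hex_to_dec('#ff003c') == (255, 0, 60)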
color_resolution = 1000
brightness = 0.75 # change to have brighter/darker colors
crx = list(range(1, color_resolution+1)) # the resolution is 1000 colors
cry = [5 for i in range(len(crx))]
crcolor, crRGBs = generate_color_range(color_resolution, brightness) # produce spectrum
def display_boards():
global displayed_doc
if displayed_doc == "landing_page":
doc.remove_root(doc.roots[0])
doc.add_root(layouts["boards"])
displayed_doc = "boards"
def show_spinner(text="Loading..."):
spinner.text = spinner_style + spinner_html.format(text)
def hide_spinner():
spinner.text = ""
# takes a path to a dir and recursively adds all of its .csv files to the paths list
def add_directory_csv_files(dir_path, paths=None):
if not paths:
paths = []
for p in listdir(dir_path):
path = join(dir_path, p)
if isdir(path):
# call recursively for each dir
paths = add_directory_csv_files(path, paths)
elif isfile(path) and path.endswith('.csv'):
# add every file to the list
paths.append(path)
return paths
class DialogApp(wx.App):
def getFileDialog(self):
with wx.FileDialog(None, "Open CSV file", wildcard="CSV files (*.csv)|*.csv",
style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST | wx.FD_CHANGE_DIR | wx.FD_MULTIPLE) as fileDialog:
if fileDialog.ShowModal() == wx.ID_CANCEL:
return None # the user changed their mind
else:
# Proceed loading the file chosen by the user
return fileDialog.GetPaths()
def getDirDialog(self):
with wx.DirDialog(None, "Choose input directory", "",
style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST | wx.FD_CHANGE_DIR) as dirDialog:
if dirDialog.ShowModal() == wx.ID_CANCEL:
return None # the user changed their mind
else:
# Proceed loading the dir chosen by the user
return dirDialog.GetPath()
class RunType(Enum):
SINGLE_FOLDER_SINGLE_FILE = 1
SINGLE_FOLDER_MULTIPLE_FILES = 2
MULTIPLE_FOLDERS_SINGLE_FILES = 3
MULTIPLE_FOLDERS_MULTIPLE_FILES = 4
UNKNOWN = 0
class FolderType(Enum):
SINGLE_FILE = 1
MULTIPLE_FILES = 2
MULTIPLE_FOLDERS = 3
EMPTY = 4
dialog = DialogApp()
doc = curdoc()

View File

@@ -0,0 +1,22 @@
from bokeh.layouts import row, column
from bokeh.models.widgets import Div
from rl_coach.dashboard_components.experiment_board import file_selection_button, group_selection_button
from rl_coach.dashboard_components.globals import layouts
# title
title = Div(text="""<h1>Coach Dashboard</h1>""")
# landing page
landing_page_description = Div(text="""<h3>Start by selecting an experiment file or directory to open:</h3>""")
center = Div(text="""<style>html { text-align: center; } </style>""")
center_buttons = Div(text="""<style>.bk-root .bk-widget { margin: 0 auto; }</style>""", width=0)
landing_page = column(center,
title,
landing_page_description,
row(center_buttons),
row(file_selection_button, sizing_mode='scale_width'),
row(group_selection_button, sizing_mode='scale_width'),
sizing_mode='scale_width')
layouts['landing_page'] = landing_page

View File

@@ -0,0 +1,125 @@
import random
import numpy as np
from bokeh.models import ColumnDataSource
from bokeh.palettes import Dark2
from rl_coach.dashboard_components.globals import show_spinner, hide_spinner, current_color
from rl_coach.utils import squeeze_list
class Signal:
def __init__(self, name, parent, plot):
self.name = name
self.full_name = "{}/{}".format(parent.filename, self.name)
self.plot = plot
self.selected = False
self.color = random.choice(Dark2[8])
self.line = None
self.scatter = None
self.bands = None
self.bokeh_source = parent.bokeh_source
self.min_val = 0
self.max_val = 0
self.axis = 'default'
self.sub_signals = []
for name in self.bokeh_source.data.keys():
if (len(name.split('/')) == 1 and name == self.name) or '/'.join(name.split('/')[:-1]) == self.name:
self.sub_signals.append(name)
if len(self.sub_signals) > 1:
self.mean_signal = squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]])
self.stdev_signal = squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]])
self.min_signal = squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]])
self.max_signal = squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]])
else:
self.mean_signal = squeeze_list(self.name)
self.stdev_signal = None
self.min_signal = None
self.max_signal = None
self.has_bollinger_bands = False
if self.mean_signal and self.stdev_signal and self.min_signal and self.max_signal:
self.has_bollinger_bands = True
self.show_bollinger_bands = False
self.bollinger_bands_source = None
self.update_range()
def set_color(self, color):
self.color = color
if self.line:
self.line.glyph.line_color = color
if self.bands:
self.bands.glyph.fill_color = color
def plot_line(self):
global current_color
self.set_color(Dark2[8][current_color])
current_color = (current_color + 1) % len(Dark2[8])
if self.has_bollinger_bands:
self.set_bands_source()
self.create_bands()
self.line = self.plot.line('index', self.mean_signal, source=self.bokeh_source,
line_color=self.color, line_width=2)
# self.scatter = self.plot.scatter('index', self.mean_signal, source=self.bokeh_source)
self.line.visible = True
def set_selected(self, val):
if self.selected != val:
self.selected = val
if self.line:
# self.set_color(Dark2[8][current_color])
# current_color = (current_color + 1) % len(Dark2[8])
self.line.visible = self.selected
if self.bands:
self.bands.visible = self.selected and self.show_bollinger_bands
elif self.selected:
# lazy plotting - plot only when selected for the first time
self.plot_line()
def set_dash(self, dash):
self.line.glyph.line_dash = dash
def create_bands(self):
self.bands = self.plot.patch(x='band_x', y='band_y', source=self.bollinger_bands_source,
color=self.color, fill_alpha=0.4, alpha=0.1, line_width=0)
self.bands.visible = self.show_bollinger_bands
# self.min_line = plot.line('index', self.min_signal, source=self.bokeh_source,
# line_color=self.color, line_width=3, line_dash="4 4")
# self.max_line = plot.line('index', self.max_signal, source=self.bokeh_source,
# line_color=self.color, line_width=3, line_dash="4 4")
# self.min_line.visible = self.show_bollinger_bands
# self.max_line.visible = self.show_bollinger_bands
def set_bands_source(self):
x_ticks = self.bokeh_source.data['index']
mean_values = self.bokeh_source.data[self.mean_signal]
stdev_values = self.bokeh_source.data[self.stdev_signal]
band_x = np.append(x_ticks, x_ticks[::-1])
band_y = np.append(mean_values - stdev_values, mean_values[::-1] + stdev_values[::-1])
source_data = {'band_x': band_x, 'band_y': band_y}
if self.bollinger_bands_source:
self.bollinger_bands_source.data = source_data
else:
self.bollinger_bands_source = ColumnDataSource(source_data)
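# e.g. for x = [0, 1, 2], mean = [m0, m1, m2], stdev = [s0, s1, s2] the patch polygon is
# band_x = [0, 1, 2, 2, 1, 0] and band_y = [m0-s0, m1-s1, m2-s2, m2+s2, m1+s1, m0+s0],
# i.e. the lower band traversed left-to-right followed by the upper band right-to-left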
def change_bollinger_bands_state(self, new_state):
self.show_bollinger_bands = new_state
if self.bands and self.selected:
self.bands.visible = new_state
# self.min_line.visible = new_state
# self.max_line.visible = new_state
def update_range(self):
self.min_val = np.min(self.bokeh_source.data[self.mean_signal])
self.max_val = np.max(self.bokeh_source.data[self.mean_signal])
def set_axis(self, axis):
self.axis = axis
if not self.line:
self.plot_line()
self.line.visible = False
self.line.y_range_name = axis
def toggle_axis(self):
if self.axis == 'default':
self.set_axis('secondary')
else:
self.set_axis('default')

View File

@@ -0,0 +1,63 @@
import os
from os.path import basename
import pandas as pd
from pandas.errors import EmptyDataError
from rl_coach.dashboard_components.signals_file_base import SignalsFileBase
from rl_coach.dashboard_components.globals import x_axis_options
from rl_coach.utils import break_file_path
class SignalsFile(SignalsFileBase):
def __init__(self, csv_path, load=True, plot=None, use_dir_name=False):
super().__init__(plot)
self.use_dir_name = use_dir_name
self.full_csv_path = csv_path
self.dir, self.filename, _ = break_file_path(csv_path)
if use_dir_name:
parent_directory_path = os.path.abspath(os.path.join(os.path.dirname(csv_path), '..'))
if len(os.listdir(parent_directory_path)) == 1:
# get the parent directory name (since the current directory is the timestamp directory)
self.dir = parent_directory_path
self.filename = basename(self.dir)
else:
# get the common directory for all the experiments
self.dir = os.path.dirname(csv_path)
self.filename = "{}/{}".format(basename(parent_directory_path), basename(self.dir))
if load:
self.load()
# this helps set the correct x axis
self.change_averaging_window(1, force=True)
def load_csv(self, idx=None, result=None):
# load the csv and fix sparse data.
# the csv can be in the middle of being written, so we retry inside a try-except until it parses
new_csv = None
while new_csv is None:
try:
new_csv = pd.read_csv(self.full_csv_path)
break
except EmptyDataError:
new_csv = None
continue
new_csv['Wall-Clock Time'] /= 60.
new_csv = new_csv.interpolate()
# remove signals which don't contain any values
for k, v in new_csv.isna().all().items():
if v and k not in x_axis_options:
del new_csv[k]
new_csv.fillna(value=0, inplace=True)
self.csv = new_csv
self.last_modified = os.path.getmtime(self.full_csv_path)
if idx is not None:
result[idx] = (self.csv, self.last_modified)
def file_was_modified_on_disk(self):
return self.last_modified != os.path.getmtime(self.full_csv_path)

View File

@@ -0,0 +1,129 @@
import numpy as np
from bokeh.models import ColumnDataSource
from rl_coach.dashboard_components.signals import Signal
from rl_coach.dashboard_components.globals import x_axis, x_axis_options, show_spinner
class SignalsFileBase:
def __init__(self, plot):
self.plot = plot
self.full_csv_path = ""
self.dir = ""
self.filename = ""
self.signals_averaging_window = 1
self.show_bollinger_bands = False
self.csv = None
self.bokeh_source = None
self.bokeh_source_orig = None
self.last_modified = None
self.signals = {}
self.separate_files = False
self.last_reload_data_fix = False
def load_csv(self):
pass
def update_x_axis_index(self):
global x_axis
self.bokeh_source_orig.data['index'] = self.bokeh_source_orig.data[x_axis[0]]
self.bokeh_source.data['index'] = self.bokeh_source.data[x_axis[0]]
def toggle_y_axis(self, signal_name=None):
if signal_name and signal_name in self.signals.keys():
self.signals[signal_name].toggle_axis()
else:
for signal in self.signals.values():
if signal.selected:
signal.toggle_axis()
def update_source_and_signals(self):
# create bokeh data sources
self.bokeh_source_orig = ColumnDataSource(self.csv)
if self.bokeh_source is None:
self.bokeh_source = ColumnDataSource(self.csv)
self.update_x_axis_index()
else:
self.update_x_axis_index()
# smooth the data if necessary
self.change_averaging_window(self.signals_averaging_window, force=True)
# create all the signals
if len(self.signals.keys()) == 0:
self.signals = {}
unique_signal_names = []
for name in self.csv.columns:
if len(name.split('/')) == 1:
unique_signal_names.append(name)
else:
unique_signal_names.append('/'.join(name.split('/')[:-1]))
unique_signal_names = list(set(unique_signal_names))
for signal_name in unique_signal_names:
self.signals[signal_name] = Signal(signal_name, self, self.plot)
def load(self):
self.load_csv()
self.update_source_and_signals()
def reload_data(self):
# this function is a workaround to reload the data of all the signals
# if the data doesn't change, bokeh does not refresh the line
temp_data = self.bokeh_source.data.copy()
for col in self.bokeh_source.data.keys():
if not self.last_reload_data_fix:
temp_data[col] = temp_data[col][:-1]
self.last_reload_data_fix = not self.last_reload_data_fix
self.bokeh_source.data = temp_data
def change_averaging_window(self, new_size, force=False, signals=None):
if force or self.signals_averaging_window != new_size:
self.signals_averaging_window = new_size
win = np.ones(new_size) / new_size
temp_data = self.bokeh_source_orig.data.copy()
for col in self.bokeh_source.data.keys():
if col == 'index' or col in x_axis_options \
or (signals and not any(col in signal for signal in signals)):
temp_data[col] = temp_data[col][:-new_size]
continue
temp_data[col] = np.convolve(self.bokeh_source_orig.data[col], win, mode='same')[:-new_size]
self.bokeh_source.data = temp_data
# smooth bollinger bands
for signal in self.signals.values():
if signal.has_bollinger_bands:
signal.set_bands_source()
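# e.g. new_size = 5 gives win = [0.2] * 5, so np.convolve(x, win, mode='same') replaces each point
# with the mean of itself and its two nearest neighbours on each side (a simple box filter)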
def hide_all_signals(self):
for signal_name in self.signals.keys():
self.set_signal_selection(signal_name, False)
def set_signal_selection(self, signal_name, val):
self.signals[signal_name].set_selected(val)
def change_bollinger_bands_state(self, new_state):
self.show_bollinger_bands = new_state
for signal in self.signals.values():
signal.change_bollinger_bands_state(new_state)
def file_was_modified_on_disk(self):
pass
def get_range_of_selected_signals_on_axis(self, axis, selected_signal=None):
max_val = -float('inf')
min_val = float('inf')
for signal in self.signals.values():
if (selected_signal and signal.name == selected_signal) or (signal.selected and signal.axis == axis):
max_val = max(max_val, signal.max_val)
min_val = min(min_val, signal.min_val)
return min_val, max_val
def get_selected_signals(self):
signals = []
for signal in self.signals.values():
if signal.selected:
signals.append(signal)
return signals
def show_files_separately(self, val):
pass

View File

@@ -0,0 +1,192 @@
import os
from multiprocessing import Process, Manager
from os.path import basename
import pandas as pd
from rl_coach.dashboard_components.globals import x_axis_options, add_directory_csv_files, show_spinner, x_axis
from rl_coach.dashboard_components.signals_file_base import SignalsFileBase
from rl_coach.dashboard_components.signals_file import SignalsFile
class SignalsFilesGroup(SignalsFileBase):
def __init__(self, csv_paths, plot=None):
super().__init__(plot)
self.full_csv_paths = csv_paths
self.signals_files = []
if len(csv_paths) == 1 and os.path.isdir(csv_paths[0]):
self.signals_files = [SignalsFile(str(file), load=False, plot=plot) for file in add_directory_csv_files(csv_paths[0])]
else:
for csv_path in csv_paths:
if os.path.isdir(csv_path):
self.signals_files.append(SignalsFilesGroup(add_directory_csv_files(csv_path), plot=plot))
else:
self.signals_files.append(SignalsFile(str(csv_path), load=False, plot=plot))
parent_directory_path = os.path.abspath(os.path.join(os.path.dirname(csv_paths[0]), '..'))
if len(os.listdir(parent_directory_path)) == 1:
# get the parent directory name (since the current directory is the timestamp directory)
self.dir = parent_directory_path
else:
# get the common directory for all the experiments
self.dir = os.path.dirname('/'.join(os.path.commonprefix(csv_paths).split('/')[:-1]) + '/')
self.filename = '{} - Group({})'.format(basename(self.dir), len(self.signals_files))
self.signal_files_need_update = False
self.load()
def load_csv(self):
global x_axis
# load the csv's for all workers
processes = []
results = Manager().dict()
corrupted_files_idx = []
for idx, signal_file in enumerate(self.signals_files):
if not isinstance(signal_file, SignalsFilesGroup):
processes.append(Process(target=signal_file.load_csv, args=(idx, results)))
processes[-1].start()
[p.join() for p in processes]
# load csv's for SignalsFilesGroup serially for now. TODO: we should later parallelize this as well.
for idx, signal_file in enumerate(self.signals_files):
if isinstance(signal_file, SignalsFilesGroup):
signal_file.load_csv()
for idx, signal_file in enumerate(self.signals_files):
if len(list(results.keys())) > 0:
signal_file.csv, signal_file.last_modified = results[idx]
if not all(option in signal_file.csv.keys() for option in x_axis_options):
print("Warning: {} file seems to be corrupted and does contain the necessary columns "
"and will not be rendered".format(signal_file.filename))
corrupted_files_idx.append(idx)
# remove corrupted worker files
for file_idx in corrupted_files_idx:
del self.signals_files[file_idx]
# get the stats of all the columns
if len(self.signals_files) > 1:
transformed_signals_files = []
subsampling = None
for idx in range(len(self.signals_files)):
transformed_signals_files.append(self.signals_files[idx].csv.copy(deep=True))
# change the index to be the currently selected x axis
transformed_signals_files[-1].index = transformed_signals_files[-1][x_axis[0]]
# remove all duplicate index rows
transformed_signals_files[-1] = transformed_signals_files[-1][~transformed_signals_files[-1].index.duplicated()]
# fill up missing row indices. we are going to take the mean over the group and we want to make sure
# the entire group has some value for every possible index.
num_rows = int(transformed_signals_files[-1].index.values[-1])
transformed_signals_files[-1] = transformed_signals_files[-1].reindex(range(num_rows))
transformed_signals_files[-1].interpolate(inplace=True)
# sub sample the csv to max of 5000 indices (do the same subsampling to all files)
if subsampling is None:
subsampling = max(1, num_rows // 5000)
transformed_signals_files[-1] = transformed_signals_files[-1].iloc[::subsampling, :]
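# e.g. a worker csv with 40,000 rows gets subsampling = 8, thinning it to ~5,000 plotted points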
csv_group = pd.concat([signals_file for signals_file in transformed_signals_files])
columns_to_remove = [s for s in csv_group.columns if '/Stdev' in s] + \
[s for s in csv_group.columns if '/Min' in s] + \
[s for s in csv_group.columns if '/Max' in s]
for col in columns_to_remove:
del csv_group[col]
csv_group = csv_group.groupby(csv_group.index)
self.csv_mean = csv_group.mean()
self.csv_mean.columns = [s + '/Mean' for s in self.csv_mean.columns]
self.csv_stdev = csv_group.std()
self.csv_stdev.columns = [s + '/Stdev' for s in self.csv_stdev.columns]
self.csv_min = csv_group.min()
self.csv_min.columns = [s + '/Min' for s in self.csv_min.columns]
self.csv_max = csv_group.max()
self.csv_max.columns = [s + '/Max' for s in self.csv_max.columns]
# get the indices from the file with the least number of indices and which is not an evaluation worker
file_with_min_indices = transformed_signals_files[0]
for signals_file in transformed_signals_files:
if signals_file.shape[0] < file_with_min_indices.shape[0] and \
'Training reward' in signals_file.keys():
file_with_min_indices = signals_file
self.index_columns = file_with_min_indices[x_axis_options]
# concat the stats and the indices columns
num_rows = file_with_min_indices.shape[0]
self.csv = pd.concat([self.index_columns, self.csv_mean.head(num_rows), self.csv_stdev.head(num_rows),
self.csv_min.head(num_rows), self.csv_max.head(num_rows)], axis=1)
# remove the stat columns for the indices columns
columns_to_remove = [s + '/Mean' for s in x_axis_options] + \
[s + '/Stdev' for s in x_axis_options] + \
[s + '/Min' for s in x_axis_options] + \
[s + '/Max' for s in x_axis_options]
for col in columns_to_remove:
if col in self.csv.keys():
del self.csv[col]
else: # This is a group of a single file
self.csv = self.signals_files[0].csv
# remove NaNs
self.csv.fillna(value=0, inplace=True) # removing this line will make bollinger bands fail
for key in self.csv.keys():
if 'Stdev' in key and 'Evaluation' not in key:
self.csv[key] = self.csv[key].fillna(value=0)
self.signal_files_need_update = True
def reload_data(self):
SignalsFileBase.reload_data(self)
def update_x_axis_index(self):
SignalsFileBase.update_x_axis_index(self)
# update the x axis for the bollinger bands
for signal in self.signals.values():
if signal.has_bollinger_bands:
signal.set_bands_source()
def toggle_y_axis(self, signal_name=None):
for signal in self.signals.values():
if signal.selected:
signal.toggle_axis()
def change_averaging_window(self, new_size, force=False, signals=None):
SignalsFileBase.change_averaging_window(self, new_size, force, signals)
def set_signal_selection(self, signal_name, val):
self.show_files_separately(self.separate_files)
SignalsFileBase.set_signal_selection(self, signal_name, val)
def file_was_modified_on_disk(self):
for signal_file in self.signals_files:
if signal_file.file_was_modified_on_disk():
return True
return False
def show_files_separately(self, val):
self.separate_files = val
# lazy updating of the signals of each of the workers
if self.separate_files and self.signal_files_need_update:
for signal_file in self.signals_files:
signal_file.update_source_and_signals()
self.signal_files_need_update = False
for signal in self.signals.values():
if signal.selected:
if val:
signal.set_dash("4 4")
else:
signal.set_dash("")
for signal_file in self.signals_files:
try:
if val:
signal_file.set_signal_selection(signal.name, signal.selected)
else:
signal_file.set_signal_selection(signal.name, False)
except:
pass

View File

@@ -0,0 +1,219 @@
/* based on https://codepen.io/widmr/pen/tklqx by Anreas Widmer */
.spinner {
font-size: 80px;
width: 1em;
height: 1em;
position: fixed;
left: 40%;
top: 20%;
z-index: 9999;
margin: 100px auto;
border-radius: 50%;
list-style: none;
}
.spinner li {
position: absolute;
width: .2em;
height: .2em;
border-radius: 50%;
}
.spinner li:nth-child(1) {
left: 50%;
top: 0;
margin: 0 0 0 -.1em;
background: #00C176;
-webkit-transform-origin: 50% 250%;
-moz-transform-origin: 50% 250%;
-ms-transform-origin: 50% 250%;
-o-transform-origin: 50% 250%;
transform-origin: 50% 250%;
-webkit-animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
-moz-animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
-ms-animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
-o-animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
}
.spinner li:nth-child(2) {
top: 50%;
right: 0;
margin: -.1em 0 0 0;
background: #FF003C;
-webkit-transform-origin: -150% 50%;
-moz-transform-origin: -150% 50%;
-ms-transform-origin: -150% 50%;
-o-transform-origin: -150% 50%;
transform-origin: -150% 50%;
-webkit-animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
-moz-animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
-ms-animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
-o-animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
}
.spinner li:nth-child(3) {
left: 50%;
bottom: 0;
margin: 0 0 0 -.1em;
background: #FABE28;
-webkit-transform-origin: 50% -150%;
-moz-transform-origin: 50% -150%;
-ms-transform-origin: 50% -150%;
-o-transform-origin: 50% -150%;
transform-origin: 50% -150%;
-webkit-animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
-moz-animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
-ms-animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
-o-animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
}
.spinner li:nth-child(4) {
top: 50%;
left: 0;
margin: -.1em 0 0 0;
background: #88C100;
-webkit-transform-origin: 250% 50%;
-moz-transform-origin: 250% 50%;
-ms-transform-origin: 250% 50%;
-o-transform-origin: 250% 50%;
transform-origin: 250% 50%;
-webkit-animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
-moz-animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
-ms-animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
-o-animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
}
@-webkit-keyframes rota {
to { -webkit-transform: rotate(360deg); }
}
@-moz-keyframes rota {
to { -moz-transform: rotate(360deg); }
}
@-ms-keyframes rota {
to { -ms-transform: rotate(360deg); }
}
@-o-keyframes rota {
to { -o-transform: rotate(360deg); }
}
@keyframes rota {
to { transform: rotate(360deg); }
}
@-webkit-keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}
@-moz-keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}
@-ms-keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}
@-o-keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}
@keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}

View File

Binary file not shown.

Binary file not shown.

77
rl_coach/debug_utils.py Normal file
View File

@@ -0,0 +1,77 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import math
import matplotlib.pyplot as plt
import numpy as np
from rl_coach.filters.observation.observation_stacking_filter import LazyStack
def show_observation_stack(stack, channels_last=True, show=True, force_num_rows=None, row_to_update=0):
if isinstance(stack, LazyStack):
stack = np.array(stack)
if isinstance(stack, list): # is list
stack_size = len(stack)
elif len(stack.shape) == 3:
stack_size = stack.shape[0] # is numpy array
elif len(stack.shape) == 4:
stack_size = stack.shape[1] # ignore batch dimension
stack = stack[0]
else:
raise ValueError("The observation stack must be a list, a numpy array or a LazyStack object")
if channels_last:
stack = np.transpose(stack, (2, 0, 1))
stack_size = stack.shape[0]
max_cols = 10
if force_num_rows:
rows = force_num_rows
else:
rows = math.ceil(stack_size / max_cols)
cols = max_cols if stack_size > max_cols else stack_size
for i in range(stack_size):
plt.subplot(rows, cols, row_to_update * cols + i + 1)
plt.imshow(stack[i], cmap='gray')
if show:
plt.show()
def show_diff_between_two_observations(observation1, observation2):
plt.imshow(observation1 - observation2, cmap='gray')
plt.show()
def plot_grayscale_observation(observation):
plt.imshow(observation, cmap='gray')
plt.show()
def plot_episode_states(episode_transitions, state_variable: str='state', observation_index_in_stack: int=0):
observations = []
for transition in episode_transitions:
observations.append(np.array(getattr(transition, state_variable)['observation'])[..., observation_index_in_stack])
show_observation_stack(observations, False)
def plot_list_of_observation_stacks(observation_stacks):
for idx, stack in enumerate(observation_stacks):
show_observation_stack(stack['observation'], True, False,
force_num_rows=len(observation_stacks), row_to_update=idx)
plt.show()
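# --- illustrative usage sketch (not part of the original file) ---
# Visualize a stack of four synthetic grayscale frames; the data below is made up
# purely for demonstration.
if __name__ == '__main__':
    dummy_stack = [np.random.rand(84, 84) for _ in range(4)]
    show_observation_stack(dummy_stack, channels_last=False)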

View File

@@ -0,0 +1,112 @@
; Example of settings file for CARLA.
;
; This file can be loaded with the Python client to be sent to the server. It
; defines the parameters to be used when requesting a new episode.
;
; Note that server specific variables are only loaded when launching the
; simulator. Use it with `./CarlaUE4.sh -carla-settings=Path/To/This/File`.
[CARLA/Server]
; If set to false, a mock controller will be used instead of waiting for a real
; client to connect. (Server only)
UseNetworking=false
; Ports to use for the server-client communication. This can be overridden by
; the command-line switch `-world-port=N`, write and read ports will be set to
; N+1 and N+2 respectively. (Server only)
WorldPort=2000
; Time-out in milliseconds for the networking operations. (Server only)
ServerTimeOut=100000000000
; In synchronous mode, CARLA waits every frame until the control from the client
; is received.
SynchronousMode=true
; Send info about every non-player agent in the scene every frame, the
; information is attached to the measurements message. This includes other
; vehicles, pedestrians and traffic signs. Disabled by default to improve
; performance.
SendNonPlayerAgentsInfo=false
[CARLA/QualitySettings]
; Quality level of the graphics, a lower level makes the simulation run
; considerably faster. Available: Low or Epic.
QualityLevel=Low
[CARLA/LevelSettings]
; Path of the vehicle class to be used for the player. Leave empty for default.
; Paths follow the pattern "/Game/Blueprints/Vehicles/Mustang/Mustang.Mustang_C"
PlayerVehicle=
; Number of non-player vehicles to be spawned into the level.
NumberOfVehicles=15
; Number of non-player pedestrians to be spawned into the level.
NumberOfPedestrians=30
; Index of the weather/lighting presets to use. If negative, the default presets
; of the map will be used.
WeatherId=1
; Seeds for the pseudo-random number generators.
SeedVehicles=123456789
SeedPedestrians=123456789
[CARLA/Sensor]
; Names of the sensors to be attached to the player, comma-separated, each of
; them should be defined in its own subsection.
; Uncomment next line to add a camera called FrontCamera to the vehicle
Sensors=FrontCamera
; or uncomment next line to add a camera and a Lidar
; Sensors=FrontCamera,MyLidar
; or uncomment next line to add a regular camera and a depth camera
; Sensors=FrontCamera,FrontCamera/Depth
; Now, every camera we added needs to be defined in its own subsection.
[CARLA/Sensor/FrontCamera]
; Type of the sensor. The available types are:
; * CAMERA A scene capture camera.
; * LIDAR_RAY_CAST A Lidar implementation based on ray-casting.
SensorType=CAMERA
; Post-processing effect to be applied to this camera. Valid values:
; * None No effects applied.
; * SceneFinal Post-processing present at scene (bloom, fog, etc).
; * Depth Depth map ground-truth only.
; * SemanticSegmentation Semantic segmentation ground-truth only.
PostProcessing=SceneFinal
; Size of the captured image in pixels.
ImageSizeX=360
ImageSizeY=256
; Camera (horizontal) field of view in degrees.
FOV=90
; Position of the camera relative to the car in meters.
PositionX=0.20
PositionY=0
PositionZ=1.30
; Rotation of the camera relative to the car in degrees.
RotationPitch=8
RotationRoll=0
RotationYaw=0
[CARLA/Sensor/FrontCamera/Depth]
; The sensor can be defined in a subsection of FrontCamera so it inherits the
; values in FrontCamera. This adds a camera similar to FrontCamera but generating
; depth map images instead.
PostProcessing=Depth
[CARLA/Sensor/MyLidar]
SensorType=LIDAR_RAY_CAST
; Number of lasers.
Channels=32
; Measure distance in meters.
Range=50.0
; Points generated by all lasers per second.
PointsPerSecond=100000
; Lidar rotation frequency.
RotationFrequency=10
; Upper and lower laser angles, positive values mean above the horizontal line.
UpperFOVLimit=10
LowerFOVLimit=-30
; Position and rotation relative to the vehicle.
PositionX=0
PositionY=0
PositionZ=1.40
RotationPitch=0
RotationYaw=0
RotationRoll=0
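For reference, here is a minimal sketch of how a settings file like this is consumed on the client side, mirroring what the CARLA environment wrapper later in this commit does. The file path, host, port and timeout are illustrative, and the API is the CARLA 0.8-era Python client that the wrapper imports.
```python
from carla.client import CarlaClient

# read the raw .ini text; the client forwards it to the server when loading settings
with open('CarlaSettings.ini', 'r') as fp:
    settings = fp.read()

client = CarlaClient('localhost', 2000, timeout=100)  # port must match WorldPort above
client.connect()
scene = client.load_settings(settings)   # returns the scene description, incl. player_start_spots
client.start_episode(0)                  # start at the first available player start spot
```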

View File

@@ -0,0 +1,19 @@
A custom input filter implementation should look like this (the import paths below follow the `rl_coach` package layout used elsewhere in this release):
```python
from rl_coach.core_types import EnvResponse
from rl_coach.filters.filter import InputFilter
from rl_coach.spaces import ObservationSpace, RewardSpace

class CustomFilter(InputFilter):
def __init__(self):
...
def _filter(self, env_response: EnvResponse) -> EnvResponse:
...
def _get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace:
...
def _get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace:
...
def _validate_input_observation_space(self, input_observation_space: ObservationSpace):
...
def _reset(self):
...
```
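For context, a minimal sketch of how an input filter is wired into an environment in this release, mirroring the CARLA, Doom and Atari environments in this commit; the observation name `'observation'` and the stacking depth are illustrative:
```python
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter

# chain per-observation filters; each stage also transforms the observation space definition
MyInputFilter = InputFilter(is_a_reference_filter=True)
MyInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
MyInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(4))

# an EnvironmentParameters subclass can then expose it via its default_input_filter attribute
```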

View File

@@ -0,0 +1,16 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@@ -0,0 +1,357 @@
import random
import sys
from os import path, environ
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
try:
if 'CARLA_ROOT' in environ:
sys.path.append(path.join(environ.get('CARLA_ROOT'), 'PythonClient'))
from carla.client import CarlaClient
from carla.settings import CarlaSettings
from carla.tcp import TCPConnectionError
from carla.sensor import Camera
from carla.client import VehicleControl
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("CARLA")
import logging
import subprocess
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.spaces import BoxActionSpace, ImageObservationSpace, StateSpace, \
VectorObservationSpace
from rl_coach.utils import get_open_port, force_list
# needed for the human-control discretization below; the module path is assumed to follow
# the rl_coach.filters.action convention used by the other environments in this commit
from rl_coach.filters.action.partial_discrete_action_space_map import PartialDiscreteActionSpaceMap
from enum import Enum
import os
import signal
from typing import List, Union
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.filters.filter import InputFilter, NoOutputFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
import numpy as np
# enum of the available levels and their path
class CarlaLevel(Enum):
TOWN1 = "/Game/Maps/Town01"
TOWN2 = "/Game/Maps/Town02"
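# keyboard key codes used for human control; tuples allow pressing several keys at once (e.g. gas + turn)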
key_map = {
'BRAKE': (274,), # down arrow
'GAS': (273,), # up arrow
'TURN_LEFT': (276,), # left arrow
'TURN_RIGHT': (275,), # right arrow
'GAS_AND_TURN_LEFT': (273, 276),
'GAS_AND_TURN_RIGHT': (273, 275),
'BRAKE_AND_TURN_LEFT': (274, 276),
'BRAKE_AND_TURN_RIGHT': (274, 275),
}
CarlaInputFilter = InputFilter(is_a_reference_filter=True)
CarlaInputFilter.add_observation_filter('forward_camera', 'rescaling',
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([128, 180, 3]),
high=255)))
CarlaInputFilter.add_observation_filter('forward_camera', 'to_grayscale', ObservationRGBToYFilter())
CarlaInputFilter.add_observation_filter('forward_camera', 'to_uint8', ObservationToUInt8Filter(0, 255))
CarlaInputFilter.add_observation_filter('forward_camera', 'stacking', ObservationStackingFilter(4))
CarlaOutputFilter = NoOutputFilter()
class CameraTypes(Enum):
FRONT = "forward_camera"
LEFT = "left_camera"
RIGHT = "right_camera"
SEGMENTATION = "segmentation"
DEPTH = "depth"
LIDAR = "lidar"
class CarlaEnvironmentParameters(EnvironmentParameters):
class Quality(Enum):
LOW = "Low"
EPIC = "Epic"
def __init__(self):
super().__init__()
self.frame_skip = 3 # the frame skip affects the fps of the server directly. fps = 30 / frameskip
self.server_height = 512
self.server_width = 720
self.camera_height = 128
self.camera_width = 180
self.config = None #'environments/CarlaSettings.ini' # TODO: remove the config to prevent confusion
self.level = 'town1'
self.quality = self.Quality.LOW
self.cameras = [CameraTypes.FRONT]
self.weather_id = [1]
self.verbose = True
self.episode_max_time = 100000 # milliseconds for each episode
self.allow_braking = False
self.default_input_filter = CarlaInputFilter
self.default_output_filter = CarlaOutputFilter
@property
def path(self):
return 'rl_coach.environments.carla_environment:CarlaEnvironment'
class CarlaEnvironment(Environment):
def __init__(self, level: LevelSelection,
seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float],
visualization_parameters: VisualizationParameters,
server_height: int, server_width: int, camera_height: int, camera_width: int,
verbose: bool, config: str, episode_max_time: int,
allow_braking: bool, quality: CarlaEnvironmentParameters.Quality,
cameras: List[CameraTypes], weather_id: List[int], **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
# server configuration
self.server_height = server_height
self.server_width = server_width
self.port = get_open_port()
self.host = 'localhost'
self.map = self.env_id
# client configuration
self.verbose = verbose
self.quality = quality
self.cameras = cameras
self.weather_id = weather_id
self.episode_max_time = episode_max_time
self.allow_braking = allow_braking
self.camera_width = camera_width
self.camera_height = camera_height
# state space
self.state_space = StateSpace({
"measurements": VectorObservationSpace(4, measurements_names=["forward_speed", "x", "y", "z"])
})
for camera in self.cameras:
self.state_space[camera.value] = ImageObservationSpace(
shape=np.array([self.camera_height, self.camera_width, 3]),
high=255)
# setup server settings
self.config = config
if self.config:
# load settings from file
with open(self.config, 'r') as fp:
self.settings = fp.read()
else:
# hard coded settings
self.settings = CarlaSettings()
self.settings.set(
SynchronousMode=True,
SendNonPlayerAgentsInfo=False,
NumberOfVehicles=15,
NumberOfPedestrians=30,
WeatherId=random.choice(force_list(self.weather_id)),
QualityLevel=self.quality.value)
self.settings.randomize_seeds()
self.settings = self._add_cameras(self.settings, self.cameras, self.camera_width, self.camera_height)
# open the server
self.server = self._open_server()
logging.disable(40)
# open the client
self.game = CarlaClient(self.host, self.port, timeout=99999999)
self.game.connect()
scene = self.game.load_settings(self.settings)
# get available start positions
positions = scene.player_start_spots
self.num_pos = len(positions)
self.iterator_start_positions = 0
# action space
self.action_space = BoxActionSpace(shape=2, low=np.array([-1, -1]), high=np.array([1, 1]))
# human control
if self.human_control:
# convert continuous action space to discrete
self.steering_strength = 0.5
self.gas_strength = 1.0
self.brake_strength = 0.5
self.action_space = PartialDiscreteActionSpaceMap(
target_actions=[[0., 0.],
[0., -self.steering_strength],
[0., self.steering_strength],
[self.gas_strength, 0.],
[-self.brake_strength, 0],
[self.gas_strength, -self.steering_strength],
[self.gas_strength, self.steering_strength],
[self.brake_strength, -self.steering_strength],
[self.brake_strength, self.steering_strength]],
target_action_space=self.action_space,
descriptions=['NO-OP', 'TURN_LEFT', 'TURN_RIGHT', 'GAS', 'BRAKE',
'GAS_AND_TURN_LEFT', 'GAS_AND_TURN_RIGHT',
'BRAKE_AND_TURN_LEFT', 'BRAKE_AND_TURN_RIGHT']
)
# map keyboard keys to actions
for idx, action in enumerate(self.action_space.descriptions):
for key in key_map.keys():
if action == key:
self.key_to_action[key_map[key]] = idx
self.num_speedup_steps = 30
# measurements
self.autopilot = None
# env initialization
self.reset_internal_state(True)
# render
if self.is_rendered:
image = self.get_rendered_image()
self.renderer.create_screen(image.shape[1], image.shape[0])
def _add_cameras(self, settings, cameras, camera_width, camera_height):
# add a front facing camera
if CameraTypes.FRONT in cameras:
camera = Camera(CameraTypes.FRONT.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, 0, 0)
settings.add_sensor(camera)
# add a left facing camera
if CameraTypes.LEFT in cameras:
camera = Camera(CameraTypes.LEFT.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, -30, 0)
settings.add_sensor(camera)
# add a right facing camera
if CameraTypes.RIGHT in cameras:
camera = Camera(CameraTypes.RIGHT.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, 30, 0)
settings.add_sensor(camera)
# add a front facing depth camera
if CameraTypes.DEPTH in cameras:
camera = Camera(CameraTypes.DEPTH.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, 30, 0)
camera.PostProcessing = 'Depth'
settings.add_sensor(camera)
# add a front facing semantic segmentation camera
if CameraTypes.SEGMENTATION in cameras:
camera = Camera(CameraTypes.SEGMENTATION.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, 30, 0)
camera.PostProcessing = 'SemanticSegmentation'
settings.add_sensor(camera)
return settings
def _open_server(self):
# TODO: get experiment path
log_path = path.join('./logs/', "CARLA_LOG_{}.txt".format(self.port))
with open(log_path, "wb") as out:
cmd = [path.join(environ.get('CARLA_ROOT'), 'CarlaUE4.sh'), self.map,
"-benchmark", "-carla-server", "-fps={}".format(30 / self.frame_skip),
"-world-port={}".format(self.port),
"-windowed -ResX={} -ResY={}".format(self.server_width, self.server_height),
"-carla-no-hud"]
if self.config:
cmd.append("-carla-settings={}".format(self.config))
p = subprocess.Popen(cmd, stdout=out, stderr=out)
return p
def _close_server(self):
os.killpg(os.getpgid(self.server.pid), signal.SIGKILL)
def _update_state(self):
# get measurements and observations
measurements = []
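# keep calling read_data() until it returns a proper measurements object rather than a list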
while type(measurements) == list:
measurements, sensor_data = self.game.read_data()
self.state = {}
for camera in self.cameras:
self.state[camera.value] = sensor_data[camera.value].data
self.location = [measurements.player_measurements.transform.location.x,
measurements.player_measurements.transform.location.y,
measurements.player_measurements.transform.location.z]
is_collision = measurements.player_measurements.collision_vehicles != 0 \
or measurements.player_measurements.collision_pedestrians != 0 \
or measurements.player_measurements.collision_other != 0
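# reward shaping: capped forward speed minus penalties for leaving the lane, driving off-road, collisions and hard steering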
speed_reward = measurements.player_measurements.forward_speed - 1
if speed_reward > 30.:
speed_reward = 30.
self.reward = speed_reward \
- (measurements.player_measurements.intersection_otherlane * 5) \
- (measurements.player_measurements.intersection_offroad * 5) \
- is_collision * 100 \
- np.abs(self.control.steer) * 10
# update measurements
self.measurements = [measurements.player_measurements.forward_speed] + self.location
self.autopilot = measurements.player_measurements.autopilot_control
# action_p = ['%.2f' % member for member in [self.control.throttle, self.control.steer]]
# screen.success('REWARD: %.2f, ACTIONS: %s' % (self.reward, action_p))
if (measurements.game_timestamp >= self.episode_max_time) or is_collision:
# screen.success('EPISODE IS DONE. GameTime: {}, Collision: {}'.format(str(measurements.game_timestamp),
# str(is_collision)))
self.done = True
self.state['measurements'] = self.measurements
def _take_action(self, action):
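# action[0] in [-1, 1] maps to throttle (positive part) / brake (negative part); action[1] is the steering command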
self.control = VehicleControl()
self.control.throttle = np.clip(action[0], 0, 1)
self.control.steer = np.clip(action[1], -1, 1)
self.control.brake = np.abs(np.clip(action[0], -1, 0))
if not self.allow_braking:
self.control.brake = 0
self.control.hand_brake = False
self.control.reverse = False
self.game.send_control(self.control)
def _restart_environment_episode(self, force_environment_reset=False):
self.iterator_start_positions += 1
if self.iterator_start_positions >= self.num_pos:
self.iterator_start_positions = 0
try:
self.game.start_episode(self.iterator_start_positions)
except:
self.game.connect()
self.game.start_episode(self.iterator_start_positions)
# start the game with some initial speed
for i in range(self.num_speedup_steps):
self._take_action([1.0, 0])
def get_rendered_image(self) -> np.ndarray:
"""
Return a numpy array containing the image that will be rendered to the screen.
This can be different from the observation. For example, mujoco's observation is a measurements vector.
:return: numpy array containing the image that will be rendered to the screen
"""
image = [self.state[camera.value] for camera in self.cameras]
image = np.vstack(image)
return image
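For reference, a minimal, hypothetical sketch of configuring this environment through its parameters class; the framework's preset machinery (not shown here) is what actually instantiates the environment from these parameters:
```python
from rl_coach.environments.carla_environment import CarlaEnvironmentParameters, CameraTypes

env_params = CarlaEnvironmentParameters()
env_params.level = 'town1'                                   # see CarlaLevel for the available maps
env_params.cameras = [CameraTypes.FRONT, CameraTypes.DEPTH]  # each camera becomes an entry in the state space
env_params.allow_braking = True                              # otherwise the brake component of the action is zeroed
env_params.quality = CarlaEnvironmentParameters.Quality.EPIC
```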

View File

@@ -0,0 +1,162 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
from enum import Enum
from typing import Union
import numpy as np
try:
from dm_control import suite
from dm_control.suite.wrappers import pixels
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("DeepMind Control Suite")
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
from rl_coach.spaces import BoxActionSpace, ImageObservationSpace, VectorObservationSpace, StateSpace
class ObservationType(Enum):
Measurements = 1
Image = 2
Image_and_Measurements = 3
# Parameters
class ControlSuiteEnvironmentParameters(EnvironmentParameters):
def __init__(self):
super().__init__()
self.observation_type = ObservationType.Measurements
self.default_input_filter = ControlSuiteInputFilter
self.default_output_filter = ControlSuiteOutputFilter
@property
def path(self):
return 'rl_coach.environments.control_suite_environment:ControlSuiteEnvironment'
"""
ControlSuite Environment Components
"""
ControlSuiteInputFilter = NoInputFilter()
ControlSuiteOutputFilter = NoOutputFilter()
control_suite_envs = {':'.join(env): ':'.join(env) for env in suite.BENCHMARKING}
# Environment
class ControlSuiteEnvironment(Environment):
def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
seed: Union[None, int]=None, human_control: bool=False,
observation_type: ObservationType=ObservationType.Measurements,
custom_reward_threshold: Union[int, float]=None, **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
self.observation_type = observation_type
# load and initialize environment
domain_name, task_name = self.env_id.split(":")
self.env = suite.load(domain_name=domain_name, task_name=task_name)
if observation_type != ObservationType.Measurements:
self.env = pixels.Wrapper(self.env, pixels_only=observation_type == ObservationType.Image)
# seed
if self.seed is not None:
np.random.seed(self.seed)
random.seed(self.seed)
self.state_space = StateSpace({})
# image observations
if observation_type != ObservationType.Measurements:
self.state_space['pixels'] = ImageObservationSpace(shape=self.env.observation_spec()['pixels'].shape,
high=255)
# measurements observations
if observation_type != ObservationType.Image:
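# flatten every scalar and 1-D entry of the observation spec into a single measurements vector, keeping per-element names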
measurements_space_size = 0
measurements_names = []
for observation_space_name, observation_space in self.env.observation_spec().items():
if len(observation_space.shape) == 0:
measurements_space_size += 1
measurements_names.append(observation_space_name)
elif len(observation_space.shape) == 1:
measurements_space_size += observation_space.shape[0]
measurements_names.extend(["{}_{}".format(observation_space_name, i) for i in
range(observation_space.shape[0])])
self.state_space['measurements'] = VectorObservationSpace(shape=measurements_space_size,
measurements_names=measurements_names)
# actions
self.action_space = BoxActionSpace(
shape=self.env.action_spec().shape[0],
low=self.env.action_spec().minimum,
high=self.env.action_spec().maximum
)
# initialize the state by getting a new state from the environment
self.reset_internal_state(True)
# render
if self.is_rendered:
image = self.get_rendered_image()
scale = 1
if self.human_control:
scale = 2
if not self.native_rendering:
self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)
def _update_state(self):
self.state = {}
if self.observation_type != ObservationType.Measurements:
self.pixels = self.last_result.observation['pixels']
self.state['pixels'] = self.pixels
if self.observation_type != ObservationType.Image:
self.measurements = np.array([])
for sub_observation in self.last_result.observation.values():
if isinstance(sub_observation, np.ndarray) and len(sub_observation.shape) == 1:
self.measurements = np.concatenate((self.measurements, sub_observation))
else:
self.measurements = np.concatenate((self.measurements, np.array([sub_observation])))
self.state['measurements'] = self.measurements
self.reward = self.last_result.reward if self.last_result.reward is not None else 0
self.done = self.last_result.last()
def _take_action(self, action):
if type(self.action_space) == BoxActionSpace:
action = self.action_space.clip_action_to_space(action)
self.last_result = self.env.step(action)
def _restart_environment_episode(self, force_environment_reset=False):
self.last_result = self.env.reset()
def _render(self):
pass
def get_rendered_image(self):
return self.env.physics.render(camera_id=0)
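Similarly, a minimal sketch of selecting a dm_control level through this parameters class; `'cartpole:swingup'` is an illustrative (domain, task) pair from `suite.BENCHMARKING`, joined with `':'` as the level dictionary above expects:
```python
from rl_coach.environments.control_suite_environment import ControlSuiteEnvironmentParameters, ObservationType

env_params = ControlSuiteEnvironmentParameters()
env_params.level = 'cartpole:swingup'   # '<domain>:<task>' as in control_suite_envs
env_params.observation_type = ObservationType.Image_and_Measurements
```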

View File

@@ -0,0 +1,39 @@
# Lines starting with # are treated as comments (or with whitespaces+#).
# It doesn't matter if you use capital letters or not.
# It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
doom_scenario_path = D2_navigation.wad
doom_map = map01
# Rewards
# Each step is good for you!
living_reward = 1
# And death is not!
death_penalty = 0
# Rendering options
screen_resolution = RES_160X120
screen_format = GRAY8
render_hud = false
render_crosshair = false
render_weapon = false
render_decals = false
render_particles = false
window_visible = false
# make episodes finish after 2100 actions (tics)
episode_timeout = 2100
# Available buttons
available_buttons =
{
TURN_LEFT
TURN_RIGHT
MOVE_FORWARD
}
# Game variables that will be in the state
available_game_variables = { HEALTH }
mode = PLAYER
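For reference, a minimal sketch of loading this scenario directly with ViZDoom, mirroring what the DoomEnvironment wrapper later in this commit does; the config path is illustrative (the wrapper resolves it under its local scenarios directory):
```python
import vizdoom

game = vizdoom.DoomGame()
game.load_config('D2_navigation.cfg')   # illustrative path
game.set_window_visible(False)
game.init()
state = game.get_state()                # screen_buffer plus the HEALTH game variable declared above
```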

Binary file not shown.

View File

@@ -0,0 +1,44 @@
# Lines starting with # are treated as comments (or with whitespaces+#).
# It doesn't matter if you use capital letters or not.
# It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
# modify these to point to your vizdoom binary and freedoom2.wad
doom_scenario_path = D3_battle.wad
doom_map = map01
# Rewards
living_reward = 0
death_penalty = 0
# Rendering options
screen_resolution = RES_320X240
screen_format = CRCGCB
render_hud = false
render_crosshair = true
render_weapon = true
render_decals = false
render_particles = false
window_visible = false
# make episodes finish after 2100 actions (tics)
episode_timeout = 2100
# Available buttons
available_buttons =
{
MOVE_FORWARD
MOVE_BACKWARD
MOVE_RIGHT
MOVE_LEFT
TURN_LEFT
TURN_RIGHT
ATTACK
SPEED
}
# Game variables that will be in the state
available_game_variables = {AMMO2 HEALTH USER2}
mode = PLAYER
doom_skill = 2

Binary file not shown.

View File

@@ -0,0 +1,229 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
try:
import vizdoom
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("ViZDoom")
import os
from enum import Enum
from os import path, environ
from typing import Union, List
import numpy as np
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.filters.action.full_discrete_action_space_map import FullDiscreteActionSpaceMap
from rl_coach.filters.filter import InputFilter, OutputFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.spaces import MultiSelectActionSpace, ImageObservationSpace, \
VectorObservationSpace, StateSpace
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
# enum of the available levels and their path
class DoomLevel(Enum):
BASIC = "basic.cfg"
DEFEND = "defend_the_center.cfg"
DEATHMATCH = "deathmatch.cfg"
MY_WAY_HOME = "my_way_home.cfg"
TAKE_COVER = "take_cover.cfg"
HEALTH_GATHERING = "health_gathering.cfg"
HEALTH_GATHERING_SUPREME_COACH_LOCAL = "D2_navigation.cfg" # from https://github.com/IntelVCL/DirectFuturePrediction/tree/master/maps
DEFEND_THE_LINE = "defend_the_line.cfg"
DEADLY_CORRIDOR = "deadly_corridor.cfg"
BATTLE_COACH_LOCAL = "D3_battle.cfg" # from https://github.com/IntelVCL/DirectFuturePrediction/tree/master/maps
key_map = {
'NO-OP': 96, # `
'ATTACK': 13, # enter
'CROUCH': 306, # ctrl
'DROP_SELECTED_ITEM': ord("t"),
'DROP_SELECTED_WEAPON': ord("t"),
'JUMP': 32, # spacebar
'LAND': ord("l"),
'LOOK_DOWN': 274, # down arrow
'LOOK_UP': 273, # up arrow
'MOVE_BACKWARD': ord("s"),
'MOVE_DOWN': ord("s"),
'MOVE_FORWARD': ord("w"),
'MOVE_LEFT': 276,
'MOVE_RIGHT': 275,
'MOVE_UP': ord("w"),
'RELOAD': ord("r"),
'SELECT_NEXT_WEAPON': ord("q"),
'SELECT_PREV_WEAPON': ord("e"),
'SELECT_WEAPON0': ord("0"),
'SELECT_WEAPON1': ord("1"),
'SELECT_WEAPON2': ord("2"),
'SELECT_WEAPON3': ord("3"),
'SELECT_WEAPON4': ord("4"),
'SELECT_WEAPON5': ord("5"),
'SELECT_WEAPON6': ord("6"),
'SELECT_WEAPON7': ord("7"),
'SELECT_WEAPON8': ord("8"),
'SELECT_WEAPON9': ord("9"),
'SPEED': 304, # shift
'STRAFE': 9, # tab
'TURN180': ord("u"),
'TURN_LEFT': ord("a"), # left arrow
'TURN_RIGHT': ord("d"), # right arrow
'USE': ord("f"),
}
DoomInputFilter = InputFilter(is_a_reference_filter=True)
DoomInputFilter.add_observation_filter('observation', 'rescaling',
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([60, 76, 3]),
high=255)))
DoomInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
DoomInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
DoomInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(3))
DoomOutputFilter = OutputFilter(is_a_reference_filter=True)
DoomOutputFilter.add_action_filter('to_discrete', FullDiscreteActionSpaceMap())
class DoomEnvironmentParameters(EnvironmentParameters):
def __init__(self):
super().__init__()
self.default_input_filter = DoomInputFilter
self.default_output_filter = DoomOutputFilter
self.cameras = [DoomEnvironment.CameraTypes.OBSERVATION]
@property
def path(self):
return 'rl_coach.environments.doom_environment:DoomEnvironment'
class DoomEnvironment(Environment):
class CameraTypes(Enum):
OBSERVATION = ("observation", "screen_buffer")
DEPTH = ("depth", "depth_buffer")
LABELS = ("labels", "labels_buffer")
MAP = ("map", "automap_buffer")
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
cameras: List[CameraTypes], **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
self.cameras = cameras
# load the emulator with the required level
self.level = DoomLevel[level.upper()]
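# levels marked COACH_LOCAL ship with Coach next to this file; all others are resolved under $VIZDOOM_ROOT/scenarios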
local_scenarios_path = path.join(os.path.dirname(os.path.realpath(__file__)), 'doom')
self.scenarios_dir = local_scenarios_path if 'COACH_LOCAL' in level \
else path.join(environ.get('VIZDOOM_ROOT'), 'scenarios')
self.game = vizdoom.DoomGame()
self.game.load_config(path.join(self.scenarios_dir, self.level.value))
self.game.set_window_visible(False)
self.game.add_game_args("+vid_forcesurface 1")
self.wait_for_explicit_human_action = True
if self.human_control:
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_640X480)
elif self.is_rendered:
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_320X240)
else:
# lower resolution since we actually take only 76x60 and we don't need to render
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_160X120)
self.game.set_render_hud(False)
self.game.set_render_crosshair(False)
self.game.set_render_decals(False)
self.game.set_render_particles(False)
for camera in self.cameras:
if hasattr(self.game, 'set_{}_enabled'.format(camera.value[1])):
getattr(self.game, 'set_{}_enabled'.format(camera.value[1]))(True)
self.game.init()
# actions
actions_description = ['NO-OP']
actions_description += [str(action).split(".")[1] for action in self.game.get_available_buttons()]
actions_description = actions_description[::-1]
self.action_space = MultiSelectActionSpace(self.game.get_available_buttons_size(),
max_simultaneous_selected_actions=1,
descriptions=actions_description,
allow_no_action_to_be_selected=True)
# human control
if self.human_control:
# TODO: add this to the action space
# map keyboard keys to actions
for idx, action in enumerate(self.action_space.descriptions):
if action in key_map.keys():
self.key_to_action[(key_map[action],)] = idx
# states
self.state_space = StateSpace({
"measurements": VectorObservationSpace(self.game.get_state().game_variables.shape[0],
measurements_names=[str(m) for m in
self.game.get_available_game_variables()])
})
for camera in self.cameras:
self.state_space[camera.value[0]] = ImageObservationSpace(
shape=np.array([self.game.get_screen_height(), self.game.get_screen_width(), 3]),
high=255)
# seed
if seed is not None:
self.game.set_seed(seed)
self.reset_internal_state()
# render
if self.is_rendered:
image = self.get_rendered_image()
self.renderer.create_screen(image.shape[1], image.shape[0])
def _update_state(self):
# extract all data from the current state
state = self.game.get_state()
if state is not None and state.screen_buffer is not None:
self.measurements = state.game_variables
self.state = {'measurements': self.measurements}
for camera in self.cameras:
observation = getattr(state, camera.value[1])
if len(observation.shape) == 3:
self.state[camera.value[0]] = np.transpose(observation, (1, 2, 0))
elif len(observation.shape) == 2:
self.state[camera.value[0]] = np.repeat(np.expand_dims(observation, -1), 3, axis=-1)
self.reward = self.game.get_last_reward()
self.done = self.game.is_episode_finished()
def _take_action(self, action):
self.game.make_action(list(action), self.frame_skip)
def _restart_environment_episode(self, force_environment_reset=False):
self.game.new_episode()
def get_rendered_image(self) -> np.ndarray:
"""
Return a numpy array containing the image that will be rendered to the screen.
This can be different from the observation. For example, mujoco's observation is a measurements vector.
:return: numpy array containing the image that will be rendered to the screen
"""
image = [self.state[camera.value[0]] for camera in self.cameras]
image = np.vstack(image)
return image

View File

@@ -0,0 +1,540 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import operator
import time
from collections import OrderedDict
from typing import Union, List, Tuple, Dict
import numpy as np
from rl_coach.base_parameters import Parameters
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.core_types import GoalType, ActionType, EnvResponse, RunPhase
from rl_coach.renderer import Renderer
from rl_coach.spaces import ActionSpace, ObservationSpace, DiscreteActionSpace, RewardSpace, StateSpace
from rl_coach.utils import squeeze_list, force_list
from rl_coach import logger
from rl_coach.environments.environment_interface import EnvironmentInterface
from rl_coach.logger import screen
class LevelSelection(object):
def __init__(self, level: str):
self.selected_level = level
def select(self, level: str):
self.selected_level = level
def __str__(self):
if self.selected_level is None:
logger.screen.error("No level has been selected. Please select a level using the -lvl command line flag, "
"or change the level in the preset.", crash=True)
return self.selected_level
class SingleLevelSelection(LevelSelection):
def __init__(self, levels: Union[str, List[str], Dict[str, str]]):
super().__init__(None)
self.levels = levels
if isinstance(levels, list):
self.levels = {level: level for level in levels}
if isinstance(levels, str):
self.levels = {levels: levels}
def __str__(self):
if self.selected_level is None:
logger.screen.error("No level has been selected. Please select a level using the -lvl command line flag, "
"or change the level in the preset. \nThe available levels are: \n{}"
.format(', '.join(self.levels.keys())), crash=True)
if self.selected_level not in self.levels.keys():
logger.screen.error("The selected level ({}) is not part of the available levels ({})"
.format(self.selected_level, ', '.join(self.levels.keys())), crash=True)
return self.levels[self.selected_level]
# class SingleLevelPerPhase(LevelSelection):
# def __init__(self, levels: Dict[RunPhase, str]):
# super().__init__(None)
# self.levels = levels
#
# def __str__(self):
# super().__str__()
# if self.selected_level not in self.levels.keys():
# logger.screen.error("The selected level ({}) is not part of the available levels ({})"
# .format(self.selected_level, self.levels.keys()), crash=True)
# return self.levels[self.selected_level]
class CustomWrapper(object):
def __init__(self, environment):
super().__init__()
self.environment = environment
def __getattr__(self, attr):
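# delegate unknown attribute lookups to the wrapped environment, falling back to False if it is missing there as well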
if attr in self.__dict__:
return self.__dict__[attr]
else:
return getattr(self.environment, attr, False)
class EnvironmentParameters(Parameters):
def __init__(self):
super().__init__()
self.level = None
self.frame_skip = 4
self.seed = None
self.human_control = False
self.custom_reward_threshold = None
self.default_input_filter = None
self.default_output_filter = None
@property
def path(self):
return 'rl_coach.environments.environment:Environment'
class Environment(EnvironmentInterface):
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
**kwargs):
"""
:param level: The environment level. Each environment can have multiple levels
:param seed: a seed for the random number generator of the environment
:param frame_skip: number of frames to skip (while repeating the same action) between each two agent directives
:param human_control: human should control the environment
:param visualization_parameters: a blob of parameters used for visualization of the environment
:param **kwargs: as the class is instantiated by EnvironmentParameters, this is used to support having
additional arguments which will be ignored by this class, but might be used by others
"""
super().__init__()
# env initialization
self.game = []
self.state = {}
self.observation = None
self.goal = None
self.reward = 0
self.done = False
self.info = {}
self._last_env_response = None
self.last_action = 0
self.episode_idx = 0
self.total_steps_counter = 0
self.current_episode_steps_counter = 0
self.last_episode_time = time.time()
self.key_to_action = {}
self.last_episode_images = []
# rewards
self.total_reward_in_current_episode = 0
self.max_reward_achieved = -np.inf
self.reward_success_threshold = custom_reward_threshold
# spaces
self.state_space = self._state_space = None
self.goal_space = self._goal_space = None
self.action_space = self._action_space = None
self.reward_space = RewardSpace(1, reward_success_threshold=self.reward_success_threshold) # TODO: add a getter and setter
self.env_id = str(level)
self.seed = seed
self.frame_skip = frame_skip
# human interaction and visualization
self.human_control = human_control
self.wait_for_explicit_human_action = False
self.is_rendered = visualization_parameters.render or self.human_control
self.native_rendering = visualization_parameters.native_rendering or self.human_control
self.visualization_parameters = visualization_parameters
if not self.native_rendering:
self.renderer = Renderer()
@property
def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
"""
Get the action space of the environment
:return: the action space
"""
return self._action_space
@action_space.setter
def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
"""
Set the action space of the environment
:return: None
"""
self._action_space = val
@property
def state_space(self) -> Union[List[StateSpace], StateSpace]:
"""
Get the state space of the environment
:return: the state space
"""
return self._state_space
@state_space.setter
def state_space(self, val: Union[List[StateSpace], StateSpace]):
"""
Set the state space of the environment
:return: None
"""
self._state_space = val
@property
def goal_space(self) -> Union[List[ObservationSpace], ObservationSpace]:
"""
Get the goal space of the environment
:return: the goal space
"""
return self._goal_space
@goal_space.setter
def goal_space(self, val: Union[List[ObservationSpace], ObservationSpace]):
"""
Set the goal space of the environment
:return: None
"""
self._goal_space = val
def get_action_from_user(self) -> ActionType:
"""
Get an action from the user keyboard
:return: action index
"""
if self.wait_for_explicit_human_action:
while len(self.renderer.pressed_keys) == 0:
self.renderer.get_events()
if self.key_to_action == {}:
# the keys are the numbers on the keyboard corresponding to the action index
if len(self.renderer.pressed_keys) > 0:
action_idx = self.renderer.pressed_keys[0] - ord("1")
if 0 <= action_idx < self.action_space.shape[0]:
return action_idx
else:
# the keys are mapped through the environment to more intuitive keyboard keys
# key = tuple(self.renderer.pressed_keys)
# for key in self.renderer.pressed_keys:
for env_keys in self.key_to_action.keys():
if set(env_keys) == set(self.renderer.pressed_keys):
return self.action_space.actions[self.key_to_action[env_keys]]
# return the default action 0 so that the environment will continue running
return self.action_space.default_action
@property
def last_env_response(self) -> Union[List[EnvResponse], EnvResponse]:
"""
Get the last environment response
:return: a dictionary that contains the state, reward, etc.
"""
return squeeze_list(self._last_env_response)
@last_env_response.setter
def last_env_response(self, val: Union[List[EnvResponse], EnvResponse]):
"""
Set the last environment response
:param val: the last environment response
"""
self._last_env_response = force_list(val)
def step(self, action: ActionType) -> EnvResponse:
"""
Make a single step in the environment using the given action
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
:return: the environment response as returned in get_last_env_response
"""
# allow passing None actions to repeat the previously done action
if action is None:
action = self.last_action
action = self.action_space.clip_action_to_space(action)
if self.action_space and not self.action_space.val_matches_space_definition(action):
raise ValueError("The given action does not match the action space definition. "
"Action = {}, action space definition = {}".format(action, self.action_space))
# store the last agent action done
self.last_action = action
if self.visualization_parameters.add_rendered_image_to_env_response:
current_rendered_image = self.get_rendered_image()
self.current_episode_steps_counter += 1
if self.phase != RunPhase.UNDEFINED:
self.total_steps_counter += 1
# act
self._take_action(action)
# observe
self._update_state()
if self.is_rendered:
self.render()
self.total_reward_in_current_episode += self.reward
if self.visualization_parameters.add_rendered_image_to_env_response:
self.info['image'] = current_rendered_image
self.last_env_response = \
EnvResponse(
reward=self.reward,
next_state=self.state,
goal=self.goal,
game_over=self.done,
info=self.info
)
# store observations for video / gif dumping
if self.should_dump_video_of_the_current_episode(episode_terminated=False) and \
(self.visualization_parameters.dump_mp4 or self.visualization_parameters.dump_gifs):
self.last_episode_images.append(self.get_rendered_image())
return self.last_env_response
def render(self) -> None:
"""
Call the environment function for rendering to the screen
"""
if self.native_rendering:
self._render()
else:
self.renderer.render_image(self.get_rendered_image())
def reset_internal_state(self, force_environment_reset=False) -> EnvResponse:
"""
Reset the environment and all the variable of the wrapper
:param force_environment_reset: forces environment reset even when the game did not end
:return: A dictionary containing the observation, reward, done flag, action and measurements
"""
self.dump_video_of_last_episode_if_needed()
self._restart_environment_episode(force_environment_reset)
self.last_episode_time = time.time()
if self.current_episode_steps_counter > 0 and self.phase != RunPhase.UNDEFINED:
self.episode_idx += 1
self.done = False
self.total_reward_in_current_episode = self.reward = 0.0
self.last_action = 0
self.current_episode_steps_counter = 0
self.last_episode_images = []
self._update_state()
# render before the preprocessing of the observation, so that the image will be in its original quality
if self.is_rendered:
self.render()
self.last_env_response = \
EnvResponse(
reward=self.reward,
next_state=self.state,
goal=self.goal,
game_over=self.done,
info=self.info
)
return self.last_env_response
def get_random_action(self) -> ActionType:
"""
Returns an action picked uniformly from the available actions
:return: a numpy array with a random action
"""
return self.action_space.sample()
def get_available_keys(self) -> List[Tuple[str, ActionType]]:
"""
Return a list of tuples mapping between action names and the keyboard key that triggers them
:return: a list of tuples mapping between action names and the keyboard key that triggers them
"""
available_keys = []
if self.key_to_action != {}:
for key, idx in sorted(self.key_to_action.items(), key=operator.itemgetter(1)):
if key != ():
key_names = [self.renderer.get_key_names([k])[0] for k in key]
available_keys.append((self.action_space.descriptions[idx], ' + '.join(key_names)))
elif type(self.action_space) == DiscreteActionSpace:
for action in range(self.action_space.shape):
available_keys.append(("Action {}".format(action + 1), action + 1))
return available_keys
def get_goal(self) -> GoalType:
"""
Get the current goal that the agents needs to achieve in the environment
:return: The goal
"""
return self.goal
def set_goal(self, goal: GoalType) -> None:
"""
Set the current goal that the agent needs to achieve in the environment
:param goal: the goal that needs to be achieved
:return: None
"""
self.goal = goal
def should_dump_video_of_the_current_episode(self, episode_terminated=False):
if self.visualization_parameters.video_dump_methods:
for video_dump_method in force_list(self.visualization_parameters.video_dump_methods):
if not video_dump_method.should_dump(episode_terminated, **self.__dict__):
return False
return True
return False
def dump_video_of_last_episode_if_needed(self):
if self.visualization_parameters.video_dump_methods and self.last_episode_images != []:
if self.should_dump_video_of_the_current_episode(episode_terminated=True):
self.dump_video_of_last_episode()
def dump_video_of_last_episode(self):
frame_skipping = max(1, int(5 / self.frame_skip))
file_name = 'episode-{}_score-{}'.format(self.episode_idx, self.total_reward_in_current_episode)
fps = 10
if self.visualization_parameters.dump_gifs:
logger.create_gif(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
if self.visualization_parameters.dump_mp4:
logger.create_mp4(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Episode"] = self.episode_idx
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
screen.log_dict(log, prefix=self.phase.value)
# The following functions define the interaction with the environment.
# Any new environment that inherits the Environment class should use these signatures.
# Some of these functions are optional - please read their description for more details.
def _take_action(self, action_idx: ActionType) -> None:
"""
An environment dependent function that sends an action to the simulator.
:param action_idx: the action to perform on the environment
:return: None
"""
raise NotImplementedError("")
def _update_state(self) -> None:
"""
Updates the state from the environment.
Should update self.observation, self.reward, self.done, self.measurements and self.info
:return: None
"""
raise NotImplementedError("")
def _restart_environment_episode(self, force_environment_reset=False) -> None:
"""
Restarts the simulator episode
:param force_environment_reset: Force the environment to reset even if the episode is not done yet.
:return: None
"""
raise NotImplementedError("")
def _render(self) -> None:
"""
Renders the environment using the native simulator renderer
:return: None
"""
pass
def get_rendered_image(self) -> np.ndarray:
"""
Return a numpy array containing the image that will be rendered to the screen.
This can be different from the observation. For example, mujoco's observation is a measurements vector.
:return: numpy array containing the image that will be rendered to the screen
"""
return np.transpose(self.state['observation'], [1, 2, 0])
"""
Video Dumping Methods
"""
class VideoDumpMethod(object):
"""
Method used to decide when to dump videos
"""
def should_dump(self, episode_terminated=False, **kwargs):
raise NotImplementedError("")
class AlwaysDumpMethod(VideoDumpMethod):
"""
Dump video for every episode
"""
def __init__(self):
super().__init__()
def should_dump(self, episode_terminated=False, **kwargs):
return True
class MaxDumpMethod(VideoDumpMethod):
"""
Dump video every time a new max total reward has been achieved
"""
def __init__(self):
super().__init__()
self.max_reward_achieved = -np.inf
def should_dump(self, episode_terminated=False, **kwargs):
# if the episode has not finished yet we want to be prepared for dumping a video
if not episode_terminated:
return True
if kwargs['total_reward_in_current_episode'] > self.max_reward_achieved:
self.max_reward_achieved = kwargs['total_reward_in_current_episode']
return True
else:
return False
class EveryNEpisodesDumpMethod(object):
"""
Dump videos once in every N episodes
"""
def __init__(self, num_episodes_between_dumps: int):
super().__init__()
self.num_episodes_between_dumps = num_episodes_between_dumps
self.last_dumped_episode = 0
if num_episodes_between_dumps < 1:
raise ValueError("the number of episodes between dumps should be a positive number")
def should_dump(self, episode_terminated=False, **kwargs):
if kwargs['episode_idx'] >= self.last_dumped_episode + self.num_episodes_between_dumps - 1:
self.last_dumped_episode = kwargs['episode_idx']
return True
else:
return False
class SelectedPhaseOnlyDumpMethod(object):
"""
Dump videos when the phase of the environment matches a predefined phase
"""
def __init__(self, run_phases: Union[RunPhase, List[RunPhase]]):
self.run_phases = force_list(run_phases)
def should_dump(self, episode_terminated=False, **kwargs):
if kwargs['_phase'] in self.run_phases:
return True
else:
return False

View File

@@ -0,0 +1,149 @@
########################################################################################################################
####### Currently we are ignoring more complex cases including EnvironmentGroups - DO NOT USE THIS FILE ****************
########################################################################################################################
# #
# # Copyright (c) 2017 Intel Corporation
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
# # you may not use this file except in compliance with the License.
# # You may obtain a copy of the License at
# #
# # http://www.apache.org/licenses/LICENSE-2.0
# #
# # Unless required by applicable law or agreed to in writing, software
# # distributed under the License is distributed on an "AS IS" BASIS,
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
# #
#
# from typing import Union, List, Dict
# import numpy as np
# from environments import create_environment
# from environments.environment import Environment
# from environments.environment_interface import EnvironmentInterface, ActionType, ActionSpace
# from core_types import GoalType, Transition
#
#
# class EnvironmentGroup(EnvironmentInterface):
# """
# An EnvironmentGroup is a group of different environments.
# In the simple case, it will contain a single environment. But it can also contain multiple environments,
# where the agent can then act on them as a batch, such that the prediction of the action is more efficient.
# """
# def __init__(self, environments_parameters: List[Environment]):
# self.environments_parameters = environments_parameters
# self.environments = []
# self.action_space = []
# self.outgoing_control = []
# self._last_env_response = []
#
# @property
# def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
# """
# Get the action space of the environment
# :return: the action space
# """
# return self.action_space
#
# @action_space.setter
# def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
# """
# Set the action space of the environment
# :return: None
# """
# self.action_space = val
#
# @property
# def phase(self) -> RunPhase:
# """
# Get the phase of the environments group
# :return: the current phase
# """
# return self.phase
#
# @phase.setter
# def phase(self, val: RunPhase):
# """
# Change the phase of each one of the environments in the group
# :param val: the new phase
# :return: None
# """
# self.phase = val
# call_method_for_all(self.environments, 'phase', val)
#
# def _create_environments(self):
# """
# Create the environments using the given parameters and update the environments list
# :return: None
# """
# for environment_parameters in self.environments_parameters:
# environment = create_environment(environment_parameters)
# self.action_space = self.action_space.append(environment.action_space)
# self.environments.append(environment)
#
# @property
# def last_env_response(self) -> Union[List[Transition], Transition]:
# """
# Get the last environment response
# :return: a dictionary that contains the state, reward, etc.
# """
# return squeeze_list(self._last_env_response)
#
# @last_env_response.setter
# def last_env_response(self, val: Union[List[Transition], Transition]):
# """
# Set the last environment response
# :param val: the last environment response
# """
# self._last_env_response = force_list(val)
#
# def step(self, actions: Union[List[ActionType], ActionType]) -> List[Transition]:
# """
# Act in all the environments in the group.
# :param actions: can be either a single action if there is a single environment in the group, or a list of
# actions in case there are multiple environments in the group. Each action can be an action index
# or a numpy array representing a continuous action for example.
# :return: The responses from all the environments in the group
# """
#
# actions = force_list(actions)
# if len(actions) != len(self.environments):
# raise ValueError("The number of actions does not match the number of environments in the group")
#
# result = []
# for environment, action in zip(self.environments, actions):
# result.append(environment.step(action))
#
# self.last_env_response = result
#
# return result
#
# def reset(self, force_environment_reset: bool=False) -> List[Transition]:
# """
# Reset all the environments in the group
# :param force_environment_reset: force the reset of each one of the environments
# :return: a list of the environments responses
# """
# return call_method_for_all(self.environments, 'reset', force_environment_reset)
#
# def get_random_action(self) -> List[ActionType]:
# """
# Get a list of random action that can be applied on the environments in the group
# :return: a list of random actions
# """
# return call_method_for_all(self.environments, 'get_random_action')
#
# def set_goal(self, goal: GoalType) -> None:
# """
# Set the goal of each one of the environments in the group to be the given goal
# :param goal: a goal vector
# :return: None
# """
# # TODO: maybe enable setting multiple goals?
# call_method_for_all(self.environments, 'set_goal', goal)

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, Dict
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import ActionType, EnvResponse, RunPhase
class EnvironmentInterface(object):
def __init__(self):
self._phase = RunPhase.UNDEFINED
@property
def phase(self) -> RunPhase:
"""
Get the phase of the environment
:return: the current phase
"""
return self._phase
@phase.setter
def phase(self, val: RunPhase):
"""
Change the phase of the environment
:param val: the new phase
:return: None
"""
self._phase = val
@property
def action_space(self) -> Union[Dict[str, ActionSpace], ActionSpace]:
"""
Get the action space of the environment (or of each of the agents wrapped in this environment,
i.e. in the LevelManager case)
:return: the action space
"""
raise NotImplementedError("")
def get_random_action(self) -> ActionType:
"""
Get a random action from the environment action space
:return: An action that follows the definition of the action space.
"""
raise NotImplementedError("")
def step(self, action: ActionType) -> Union[None, EnvResponse]:
"""
Make a single step in the environment using the given action
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
:return: the environment response as returned in get_last_env_response or None for LevelManager
"""
raise NotImplementedError("")
def reset_internal_state(self, force_environment_reset: bool=False) -> Union[None, EnvResponse]:
"""
Reset the environment episode
:param force_environment_reset: in some cases, resetting the environment can be suppressed by the environment
itself. This flag allows forcing the reset.
:return: the environment response as returned in get_last_env_response or None for LevelManager
"""
raise NotImplementedError("")

View File

@@ -0,0 +1,454 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import gym
import numpy as np
import scipy.ndimage
from rl_coach.utils import lower_under_to_upper, short_dynamic_import
try:
import roboschool
from OpenGL import GL
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("RoboSchool")
try:
from rl_coach.gym_extensions.continuous import mujoco
except:
from rl_coach.logger import failed_imports
failed_imports.append("GymExtensions")
try:
import pybullet_envs
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("PyBullet")
from typing import Dict, Any, Union
from rl_coach.core_types import RunPhase
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, ImageObservationSpace, VectorObservationSpace, \
StateSpace, RewardSpace
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
from rl_coach.filters.reward.reward_clipping_filter import RewardClippingFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.filters.filter import InputFilter
import random
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.logger import screen
# Parameters
class GymEnvironmentParameters(EnvironmentParameters):
def __init__(self):
super().__init__()
self.random_initialization_steps = 0
self.max_over_num_frames = 1
self.additional_simulator_parameters = None
@property
def path(self):
return 'rl_coach.environments.gym_environment:GymEnvironment'
"""
Roboschool Environment Components
"""
RoboSchoolInputFilters = NoInputFilter()
RoboSchoolOutputFilters = NoOutputFilter()
class Roboschool(GymEnvironmentParameters):
def __init__(self):
super().__init__()
self.frame_skip = 1
self.default_input_filter = RoboSchoolInputFilters
self.default_output_filter = RoboSchoolOutputFilters
gym_roboschool_envs = ['inverted_pendulum', 'inverted_pendulum_swingup', 'inverted_double_pendulum', 'reacher',
'hopper', 'walker2d', 'half_cheetah', 'ant', 'humanoid', 'humanoid_flagrun',
'humanoid_flagrun_harder', 'pong']
roboschool_v0 = {e: "{}".format(lower_under_to_upper(e) + '-v0') for e in gym_roboschool_envs}
"""
Mujoco Environment Components
"""
MujocoInputFilter = NoInputFilter()
MujocoOutputFilter = NoOutputFilter()
class Mujoco(GymEnvironmentParameters):
def __init__(self):
super().__init__()
self.frame_skip = 1
self.default_input_filter = MujocoInputFilter
self.default_output_filter = MujocoOutputFilter
gym_mujoco_envs = ['inverted_pendulum', 'inverted_double_pendulum', 'reacher', 'hopper', 'walker2d', 'half_cheetah',
'ant', 'swimmer', 'humanoid', 'humanoid_standup', 'pusher', 'thrower', 'striker']
mujoco_v2 = {e: "{}".format(lower_under_to_upper(e) + '-v2') for e in gym_mujoco_envs}
mujoco_v2['walker2d'] = 'Walker2d-v2'
gym_fetch_envs = ['reach', 'slide', 'push', 'pick_and_place']
fetch_v1 = {e: "{}".format('Fetch' + lower_under_to_upper(e) + '-v1') for e in gym_fetch_envs}
"""
Bullet Environment Components
"""
BulletInputFilter = NoInputFilter()
BulletOutputFilter = NoOutputFilter()
class Bullet(GymEnvironmentParameters):
def __init__(self):
super().__init__()
self.frame_skip = 1
self.default_input_filter = BulletInputFilter
self.default_output_filter = BulletOutputFilter
"""
Atari Environment Components
"""
AtariInputFilter = InputFilter(is_a_reference_filter=True)
AtariInputFilter.add_reward_filter('clipping', RewardClippingFilter(-1.0, 1.0))
AtariInputFilter.add_observation_filter('observation', 'rescaling',
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([84, 84, 3]),
high=255)))
AtariInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
AtariInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
AtariInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(4))
AtariOutputFilter = NoOutputFilter()
class Atari(GymEnvironmentParameters):
def __init__(self):
super().__init__()
self.frame_skip = 4
self.max_over_num_frames = 2
self.random_initialization_steps = 30
self.default_input_filter = AtariInputFilter
self.default_output_filter = AtariOutputFilter
gym_atari_envs = ['air_raid', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis',
'bank_heist', 'battle_zone', 'beam_rider', 'berzerk', 'bowling', 'boxing', 'breakout', 'carnival',
'centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk',
'elevator_action', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar',
'hero', 'ice_hockey', 'jamesbond', 'journey_escape', 'kangaroo', 'krull', 'kung_fu_master',
'montezuma_revenge', 'ms_pacman', 'name_this_game', 'phoenix', 'pitfall', 'pong', 'pooyan',
'private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing',
'solaris', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down',
'venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge', 'zaxxon']
atari_deterministic_v4 = {e: lower_under_to_upper(e) + 'Deterministic-v4' for e in gym_atari_envs}
atari_no_frameskip_v4 = {e: lower_under_to_upper(e) + 'NoFrameskip-v4' for e in gym_atari_envs}
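# Resulting mappings (sketch, assuming lower_under_to_upper('space_invaders') returns 'SpaceInvaders'):
#   atari_deterministic_v4['breakout']  ->  'BreakoutDeterministic-v4'
#   atari_no_frameskip_v4['pong']       ->  'PongNoFrameskip-v4'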
class MaxOverFramesAndFrameskipEnvWrapper(gym.Wrapper):
def __init__(self, env, frameskip=4, max_over_num_frames=2):
super().__init__(env)
self.max_over_num_frames = max_over_num_frames
self.observations_stack = []
self.frameskip = frameskip
self.first_frame_to_max_over = self.frameskip - self.max_over_num_frames
def reset(self):
return self.env.reset()
def step(self, action):
total_reward = 0.0
done = None
info = None
self.observations_stack = []
for i in range(self.frameskip):
observation, reward, done, info = self.env.step(action)
if i >= self.first_frame_to_max_over:
self.observations_stack.append(observation)
total_reward += reward
if done:
# deal with last state in episode
if not self.observations_stack:
self.observations_stack.append(observation)
break
max_over_frames_observation = np.max(self.observations_stack, axis=0)
return max_over_frames_observation, total_reward, done, info
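# Usage sketch for the wrapper above (assumes a NoFrameskip Atari env, so that frame skipping
# and the pixel-wise max over the last frames are handled here rather than inside ALE):
#   raw_env = gym.make('BreakoutNoFrameskip-v4')
#   env = MaxOverFramesAndFrameskipEnvWrapper(raw_env, frameskip=4, max_over_num_frames=2)
#   observation = env.reset()
#   observation, reward, done, info = env.step(env.action_space.sample())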
# Environment
class GymEnvironment(Environment):
def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
additional_simulator_parameters: Dict[str, Any] = None, seed: Union[None, int]=None,
human_control: bool=False, custom_reward_threshold: Union[int, float]=None,
random_initialization_steps: int=1, max_over_num_frames: int=1, **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold,
visualization_parameters)
self.random_initialization_steps = random_initialization_steps
self.max_over_num_frames = max_over_num_frames
self.additional_simulator_parameters = additional_simulator_parameters
# hide warnings
gym.logger.set_level(40)
"""
Load and initialize the environment.
Environment ids can be defined in 3 ways:
1. Native gym environments, e.g. BreakoutDeterministic-v0
2. Custom gym environments that are written and installed as python packages.
   These environments should contain a python module with a class that inherits from gym.Env,
   implements the relevant functions (_reset, _step, _render) and defines the observation and
   action spaces.
   For example: my_environment_package:MyEnvironmentClass will run the environment defined in
   the MyEnvironmentClass class
3. Custom gym environments written as an independent module which is not installed.
   These environments should contain a python module with a class that inherits from gym.Env,
   implements the relevant functions (_reset, _step, _render) and defines the observation and
   action spaces.
   For example: path_to_my_environment.sub_directory.my_module:MyEnvironmentClass will run the
   environment defined in the MyEnvironmentClass class, which is located in the module at the
   relative path path_to_my_environment.sub_directory.my_module
"""
if ':' in self.env_id:
# custom environments
if '/' in self.env_id or '.' in self.env_id:
# environment in an absolute path module written as a unix path, or in a relative path module
# written as a python import path
env_class = short_dynamic_import(self.env_id)
else:
# environment in a python package
env_class = gym.envs.registration.load(self.env_id)
# instantiate the environment
if self.additional_simulator_parameters:
self.env = env_class(**self.additional_simulator_parameters)
else:
self.env = env_class()
else:
self.env = gym.make(self.env_id)
# for classic control we want to use the native renderer because otherwise we will get 2 renderer windows
environment_to_always_use_with_native_rendering = ['classic_control', 'mujoco', 'robotics']
self.native_rendering = self.native_rendering or \
any([env in str(self.env.unwrapped.__class__)
for env in environment_to_always_use_with_native_rendering])
if self.native_rendering:
if hasattr(self, 'renderer'):
self.renderer.close()
# seed
if self.seed is not None:
self.env.seed(self.seed)
np.random.seed(self.seed)
random.seed(self.seed)
# frame skip and max between consecutive frames
self.is_robotics_env = 'robotics' in str(self.env.unwrapped.__class__)
self.is_mujoco_env = 'mujoco' in str(self.env.unwrapped.__class__)
self.is_atari_env = 'Atari' in str(self.env.unwrapped.__class__)
self.timelimit_env_wrapper = self.env
if self.is_atari_env:
self.env.unwrapped.frameskip = 1 # this accesses the atari env that is wrapped with a timelimit wrapper env
if self.env_id == "SpaceInvadersDeterministic-v4" and self.frame_skip == 4:
screen.warning("Warning: The frame-skip for Space Invaders was automatically updated from 4 to 3. "
"This is following the DQN paper where it was noticed that a frame-skip of 3 makes the "
"laser rays disappear. To force frame-skip of 4, please use SpaceInvadersNoFrameskip-v4.")
self.frame_skip = 3
self.env = MaxOverFramesAndFrameskipEnvWrapper(self.env,
frameskip=self.frame_skip,
max_over_num_frames=self.max_over_num_frames)
else:
self.env.unwrapped.frameskip = self.frame_skip
self.state_space = StateSpace({})
# observations
if not isinstance(self.env.observation_space, gym.spaces.Dict):
state_space = {'observation': self.env.observation_space}
else:
state_space = self.env.observation_space.spaces
for observation_space_name, observation_space in state_space.items():
if len(observation_space.shape) == 3 and observation_space.shape[-1] == 3:
# we assume the observation is an RGB image with pixel values in the range 0-255
self.state_space[observation_space_name] = ImageObservationSpace(
shape=np.array(observation_space.shape),
high=255,
channels_axis=-1
)
else:
self.state_space[observation_space_name] = VectorObservationSpace(
shape=observation_space.shape[0],
low=observation_space.low,
high=observation_space.high
)
if 'desired_goal' in state_space.keys():
self.goal_space = self.state_space['desired_goal']
# actions
if type(self.env.action_space) == gym.spaces.box.Box:
self.action_space = BoxActionSpace(
shape=self.env.action_space.shape,
low=self.env.action_space.low,
high=self.env.action_space.high
)
elif type(self.env.action_space) == gym.spaces.discrete.Discrete:
actions_description = []
if hasattr(self.env.unwrapped, 'get_action_meanings'):
actions_description = self.env.unwrapped.get_action_meanings()
self.action_space = DiscreteActionSpace(
num_actions=self.env.action_space.n,
descriptions=actions_description
)
if self.human_control:
# TODO: add this to the action space
# map keyboard keys to actions
self.key_to_action = {}
if hasattr(self.env.unwrapped, 'get_keys_to_action'):
self.key_to_action = self.env.unwrapped.get_keys_to_action()
# initialize the state by getting a new state from the environment
self.reset_internal_state(True)
# render
if self.is_rendered:
image = self.get_rendered_image()
scale = 1
if self.human_control:
scale = 2
if not self.native_rendering:
self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)
# measurements
if self.env.spec is not None:
self.timestep_limit = self.env.spec.timestep_limit
else:
self.timestep_limit = None
# the info is only updated after the first step
self.state = self.step(self.action_space.default_action).next_state
self.state_space['measurements'] = VectorObservationSpace(shape=len(self.info.keys()))
if self.env.spec and custom_reward_threshold is None:
self.reward_success_threshold = self.env.spec.reward_threshold
self.reward_space = RewardSpace(1, reward_success_threshold=self.reward_success_threshold)
def _wrap_state(self, state):
if not isinstance(self.env.observation_space, gym.spaces.Dict):
return {'observation': state}
return state
def _update_state(self):
if self.is_atari_env and hasattr(self, 'current_ale_lives') \
and self.current_ale_lives != self.env.unwrapped.ale.lives():
if self.phase == RunPhase.TRAIN or self.phase == RunPhase.HEATUP:
# signal termination for life loss
self.done = True
elif self.phase == RunPhase.TEST and not self.done:
# the episode is not terminated in evaluation, but we need to press fire again
self._press_fire()
self._update_ale_lives()
# TODO: update the measurements
if self.state and "desired_goal" in self.state.keys():
self.goal = self.state['desired_goal']
def _take_action(self, action):
if type(self.action_space) == BoxActionSpace:
action = self.action_space.clip_action_to_space(action)
self.state, self.reward, self.done, self.info = self.env.step(action)
self.state = self._wrap_state(self.state)
def _random_noop(self):
# simulate a random initial environment state by stepping for a random number of steps between 0 and random_initialization_steps
step_count = 0
random_initialization_steps = random.randint(0, self.random_initialization_steps)
while self.action_space is not None and (self.state is None or step_count < random_initialization_steps):
step_count += 1
self.step(self.action_space.default_action)
def _press_fire(self):
fire_action = 1
if self.is_atari_env and self.env.unwrapped.get_action_meanings()[fire_action] == 'FIRE':
self.current_ale_lives = self.env.unwrapped.ale.lives()
self.step(fire_action)
if self.done:
self.reset_internal_state()
def _update_ale_lives(self):
if self.is_atari_env:
self.current_ale_lives = self.env.unwrapped.ale.lives()
def _restart_environment_episode(self, force_environment_reset=False):
# prevent reset of environment if there are ale lives left
if (self.is_atari_env and self.env.unwrapped.ale.lives() > 0) \
and not force_environment_reset and not self.timelimit_env_wrapper._past_limit():
self.step(self.action_space.default_action)
else:
self.state = self.env.reset()
self.state = self._wrap_state(self.state)
self._update_ale_lives()
if self.is_atari_env:
self._random_noop()
self._press_fire()
# initialize the number of lives
self._update_ale_lives()
def _set_mujoco_camera(self, camera_idx: int):
"""
This function can be used to set the camera for rendering the mujoco simulator
:param camera_idx: The index of the camera to use. Should be defined in the model
:return: None
"""
if self.env.unwrapped.viewer.cam.fixedcamid != camera_idx and self.env.unwrapped.viewer._ncam > camera_idx:
from mujoco_py.generated import const
self.env.unwrapped.viewer.cam.type = const.CAMERA_FIXED
self.env.unwrapped.viewer.cam.fixedcamid = camera_idx
def _get_robotics_image(self):
self.env.render()
image = self.env.unwrapped._get_viewer().read_pixels(1600, 900, depth=False)[::-1, :, :]
image = scipy.misc.imresize(image, (270, 480, 3))
return image
def _render(self):
self.env.render(mode='human')
# required for setting up a fixed camera for mujoco
if self.is_mujoco_env:
self._set_mujoco_camera(0)
def get_rendered_image(self):
if self.is_robotics_env:
# necessary for fetch since the rendered image is cropped to an irrelevant part of the simulator
image = self._get_robotics_image()
else:
image = self.env.render(mode='rgb_array')
# required for setting up a fixed camera for mujoco
if self.is_mujoco_env:
self._set_mujoco_camera(0)
return image
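# Configuration sketch (assumption: as in Coach presets, the gym id is set through the
# environment parameters' `level` attribute and the framework instantiates GymEnvironment
# from the `path` property defined above):
#   env_params = Atari()
#   env_params.level = atari_deterministic_v4['breakout']  # 'BreakoutDeterministic-v4'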

View File

View File

@@ -0,0 +1,38 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Functions to manage the common assets for domains."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from dm_control.utils import resources
_SUITE_DIR = os.path.dirname(os.path.dirname(__file__))
_FILENAMES = [
"common/materials.xml",
"common/skybox.xml",
"common/visual.xml",
]
ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename))
for filename in _FILENAMES}
def read_model(model_filename):
"""Reads a model XML file and returns its contents as a string."""
return resources.GetResource(os.path.join(_SUITE_DIR, model_filename))
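# Usage sketch (hypothetical model file name; the returned XML string is typically consumed
# together with ASSETS, e.g. by dm_control's mujoco.Physics.from_xml_string):
#   model_xml = read_model('pendulum.xml')
#   physics = mujoco.Physics.from_xml_string(model_xml, assets=ASSETS)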

View File

@@ -0,0 +1,22 @@
<!--
Common textures, colors and materials to be used throughout this suite. Some
materials such as xxx_highlight are activated on occurrence of certain events,
for example receiving a positive reward.
-->
<mujoco>
<asset>
<texture name="grid" type="2d" builtin="checker" rgb1=".1 .2 .3" rgb2=".2 .3 .4" width="300" height="300" mark="edge" markrgb=".2 .3 .4"/>
<material name="grid" texture="grid" texrepeat="1 1" texuniform="true" reflectance=".2"/>
<material name="self" rgba=".7 .5 .3 1"/>
<material name="self_default" rgba=".7 .5 .3 1"/>
<material name="self_highlight" rgba="0 .5 .3 1"/>
<material name="effector" rgba=".7 .4 .2 1"/>
<material name="effector_default" rgba=".7 .4 .2 1"/>
<material name="effector_highlight" rgba="0 .5 .3 1"/>
<material name="decoration" rgba=".3 .5 .7 1"/>
<material name="eye" rgba="0 .2 1 1"/>
<material name="target" rgba=".6 .3 .3 1"/>
<material name="target_default" rgba=".6 .3 .3 1"/>
<material name="target_highlight" rgba=".6 .3 .3 .4"/>
</asset>
</mujoco>

View File

@@ -0,0 +1,6 @@
<mujoco>
<asset>
<texture name="skybox" type="skybox" builtin="gradient" rgb1=".4 .6 .8" rgb2="0 0 0"
width="800" height="800" mark="random" markrgb="1 1 1"/>
</asset>
</mujoco>

View File

@@ -0,0 +1,7 @@
<mujoco>
<visual>
<headlight ambient=".4 .4 .4" diffuse=".8 .8 .8" specular="0.1 0.1 0.1"/>
<map znear=".01"/>
<quality shadowsize="2048"/>
</visual>
</mujoco>

View File

@@ -0,0 +1,185 @@
import numpy as np
import gym
import os
from gym import spaces
from gym.envs.registration import EnvSpec
from mujoco_py import load_model_from_path, MjSim, MjViewer, MjRenderContextOffscreen
class PendulumWithGoals(gym.Env):
metadata = {
'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30
}
def __init__(self, goal_reaching_thresholds=np.array([0.075, 0.075, 0.75]),
goal_not_reached_penalty=-1, goal_reached_reward=0, terminate_on_goal_reaching=True,
time_limit=1000, frameskip=1, random_goals_instead_of_standing_goal=False,
polar_coordinates: bool=False):
super().__init__()
dir = os.path.dirname(__file__)
model = load_model_from_path(dir + "/pendulum_with_goals.xml")
self.sim = MjSim(model)
self.viewer = None
self.rgb_viewer = None
self.frameskip = frameskip
self.goal = None
self.goal_reaching_thresholds = goal_reaching_thresholds
self.goal_not_reached_penalty = goal_not_reached_penalty
self.goal_reached_reward = goal_reached_reward
self.terminate_on_goal_reaching = terminate_on_goal_reaching
self.time_limit = time_limit
self.current_episode_steps_counter = 0
self.random_goals_instead_of_standing_goal = random_goals_instead_of_standing_goal
self.polar_coordinates = polar_coordinates
# spaces definition
self.action_space = spaces.Box(low=-self.sim.model.actuator_ctrlrange[:, 1],
high=self.sim.model.actuator_ctrlrange[:, 1],
dtype=np.float32)
if self.polar_coordinates:
self.observation_space = spaces.Dict({
"observation": spaces.Box(low=np.array([-np.pi, -15]),
high=np.array([np.pi, 15]),
dtype=np.float32),
"desired_goal": spaces.Box(low=np.array([-np.pi, -15]),
high=np.array([np.pi, 15]),
dtype=np.float32),
"achieved_goal": spaces.Box(low=np.array([-np.pi, -15]),
high=np.array([np.pi, 15]),
dtype=np.float32)
})
else:
self.observation_space = spaces.Dict({
"observation": spaces.Box(low=np.array([-1, -1, -15]),
high=np.array([1, 1, 15]),
dtype=np.float32),
"desired_goal": spaces.Box(low=np.array([-1, -1, -15]),
high=np.array([1, 1, 15]),
dtype=np.float32),
"achieved_goal": spaces.Box(low=np.array([-1, -1, -15]),
high=np.array([1, 1, 15]),
dtype=np.float32)
})
self.spec = EnvSpec('PendulumWithGoals-v0')
self.spec.reward_threshold = self.goal_not_reached_penalty * self.time_limit
self.reset()
def _goal_reached(self):
observation = self._get_obs()
if np.any(np.abs(observation['achieved_goal'] - observation['desired_goal']) > self.goal_reaching_thresholds):
return False
else:
return True
def _terminate(self):
if (self._goal_reached() and self.terminate_on_goal_reaching) or \
self.current_episode_steps_counter >= self.time_limit:
return True
else:
return False
def _reward(self):
if self._goal_reached():
return self.goal_reached_reward
else:
return self.goal_not_reached_penalty
def step(self, action):
self.sim.data.ctrl[:] = action
for _ in range(self.frameskip):
self.sim.step()
self.current_episode_steps_counter += 1
state = self._get_obs()
# visualize the angular velocities
state_velocity = np.copy(state['observation'][-1] / 20)
goal_velocity = self.goal[-1] / 20
self.sim.model.site_size[2] = np.array([0.01, 0.01, state_velocity])
self.sim.data.mocap_pos[2] = np.array([0.85, 0, 0.75 + state_velocity])
self.sim.model.site_size[3] = np.array([0.01, 0.01, goal_velocity])
self.sim.data.mocap_pos[3] = np.array([1.15, 0, 0.75 + goal_velocity])
return state, self._reward(), self._terminate(), {}
def _get_obs(self):
"""
y
^
|____
| /
| /
|~/
|/
--------> x
"""
# observation
angle = self.sim.data.qpos
angular_velocity = self.sim.data.qvel
if self.polar_coordinates:
observation = np.concatenate([angle - np.pi, angular_velocity])
else:
x = np.sin(angle)
y = np.cos(angle) # qpos is the angle relative to a standing pole
observation = np.concatenate([x, y, angular_velocity])
return {
"observation": observation,
"desired_goal": self.goal,
"achieved_goal": observation
}
def reset(self):
self.current_episode_steps_counter = 0
# set initial state
angle = np.random.uniform(np.pi / 4, 7 * np.pi / 4)
angular_velocity = np.random.uniform(-0.05, 0.05)
self.sim.data.qpos[0] = angle
self.sim.data.qvel[0] = angular_velocity
self.sim.step()
# goal
if self.random_goals_instead_of_standing_goal:
angle_target = np.random.uniform(-np.pi / 8, np.pi / 8)
angular_velocity_target = np.random.uniform(-0.2, 0.2)
else:
angle_target = 0
angular_velocity_target = 0
# convert target values to goal
x_target = np.sin(angle_target)
y_target = np.cos(angle_target)
if self.polar_coordinates:
self.goal = np.array([angle_target - np.pi, angular_velocity_target])
else:
self.goal = np.array([x_target, y_target, angular_velocity_target])
# visualize the goal
self.sim.data.mocap_pos[0] = [x_target, 0, y_target]
return self._get_obs()
def render(self, mode='human', close=False):
if mode == 'human':
if self.viewer is None:
self.viewer = MjViewer(self.sim)
self.viewer.render()
elif mode == 'rgb_array':
if self.rgb_viewer is None:
self.rgb_viewer = MjRenderContextOffscreen(self.sim, 0)
self.rgb_viewer.render(500, 500)
# window size used for old mujoco-py:
data = self.rgb_viewer.read_pixels(500, 500, depth=False)
# original image is upside-down, so flip it
return data[::-1, :, :]
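# Minimal demo sketch (requires mujoco_py and the bundled pendulum_with_goals.xml):
if __name__ == '__main__':
    demo_env = PendulumWithGoals(frameskip=5, random_goals_instead_of_standing_goal=True)
    demo_obs = demo_env.reset()
    for _ in range(20):
        # sample a random torque and step; the observation dict carries the desired goal as well
        demo_obs, demo_reward, demo_done, _ = demo_env.step(demo_env.action_space.sample())
        if demo_done:
            demo_obs = demo_env.reset()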

View File

@@ -0,0 +1,42 @@
<mujoco model="pendulum_with_goals">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials.xml"/>
<option timestep="0.002">
<flag contact="disable" energy="enable"/>
</option>
<worldbody>
<light name="light" pos="0 0 2"/>
<geom name="floor" size="2 2 .2" type="plane" material="grid"/>
<camera name="fixed" pos="0 -1.5 2" xyaxes='1 0 0 0 1 1'/>
<camera name="lookat" mode="targetbodycom" target="pole" pos="0 -2 1"/>
<body name="pole" pos="0 0 .6">
<joint name="hinge" type="hinge" axis="0 1 0" damping="0.1"/>
<geom name="base" material="decoration" type="cylinder" fromto="0 -.03 0 0 .03 0" size="0.021" mass="0"/>
<geom name="pole" material="self" type="capsule" fromto="0 0 0 0 0 0.5" size="0.02" mass="0"/>
<geom name="mass" material="effector" type="sphere" pos="0 0 0.5" size="0.05" mass="1"/>
</body>
<body name="end_goal" pos="0 0 0" mocap="true">
<site type="sphere" size="0.05" rgba="1 1 0 1" />
</body>
<!--<body name="sub_goal" pos="0 0 0" mocap="true">-->
<!--<site type="sphere" size="0.05" rgba="1 0 1 1" />-->
<!--</body>-->
<body name="current_velo" pos="0.0 0 0.0" mocap="true">
<site type="box" size="0.01 0.01 0.1" rgba="1 1 1 1" />
</body>
<body name="subgoal_velo" pos="0.0 0 0.0" mocap="true">
<site type="box" size="0.01 0.01 0.1" rgba="1 0 1 1" />
</body>
<body name="zero_velo" pos="1.0 0 0.75" mocap="true">
<site type="box" size="0.3 0.01 0.01" rgba="1 0 0 1" />
</body>
</worldbody>
<actuator>
<motor name="torque" joint="hinge" gear="1" ctrlrange="-2 2" ctrllimited="true"/>
</actuator>
</mujoco>

View File

@@ -0,0 +1,245 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from enum import Enum
from typing import Union, List
import numpy as np
from rl_coach.filters.observation.observation_move_axis_filter import ObservationMoveAxisFilter
try:
from pysc2 import maps
from pysc2.env import sc2_env
from pysc2.env import available_actions_printer
from pysc2.lib import actions
from pysc2.lib import features
from pysc2.env import environment
from absl import app
from absl import flags
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("PySc2")
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.spaces import BoxActionSpace, VectorObservationSpace, PlanarMapsObservationSpace, StateSpace, CompoundActionSpace, \
DiscreteActionSpace
from rl_coach.filters.filter import InputFilter, OutputFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.action.linear_box_to_box_map import LinearBoxToBoxMap
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
FLAGS = flags.FLAGS
FLAGS(['coach.py'])
SCREEN_SIZE = 84 # will also impact the action space size
# Starcraft Constants
_NOOP = actions.FUNCTIONS.no_op.id
_MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_PLAYER_RELATIVE = features.SCREEN_FEATURES.player_relative.index
_NOT_QUEUED = [0]
_SELECT_ALL = [0]
class StarcraftObservationType(Enum):
Features = 0
RGB = 1
StarcraftInputFilter = InputFilter(is_a_reference_filter=True)
StarcraftInputFilter.add_observation_filter('screen', 'move_axis', ObservationMoveAxisFilter(0, -1))
StarcraftInputFilter.add_observation_filter('screen', 'rescaling',
ObservationRescaleToSizeFilter(
PlanarMapsObservationSpace(np.array([84, 84, 1]),
low=0, high=255, channels_axis=-1)))
StarcraftInputFilter.add_observation_filter('screen', 'to_uint8', ObservationToUInt8Filter(0, 255))
StarcraftInputFilter.add_observation_filter('minimap', 'move_axis', ObservationMoveAxisFilter(0, -1))
StarcraftInputFilter.add_observation_filter('minimap', 'rescaling',
ObservationRescaleToSizeFilter(
PlanarMapsObservationSpace(np.array([64, 64, 1]),
low=0, high=255, channels_axis=-1)))
StarcraftInputFilter.add_observation_filter('minimap', 'to_uint8', ObservationToUInt8Filter(0, 255))
StarcraftNormalizingOutputFilter = OutputFilter(is_a_reference_filter=True)
StarcraftNormalizingOutputFilter.add_action_filter(
'normalization', LinearBoxToBoxMap(input_space_low=-SCREEN_SIZE / 2, input_space_high=SCREEN_SIZE / 2 - 1))
class StarCraft2EnvironmentParameters(EnvironmentParameters):
def __init__(self):
super().__init__()
self.screen_size = 84
self.minimap_size = 64
self.feature_minimap_maps_to_use = range(7)
self.feature_screen_maps_to_use = range(17)
self.observation_type = StarcraftObservationType.Features
self.disable_fog = False
self.auto_select_all_army = True
self.default_input_filter = StarcraftInputFilter
self.default_output_filter = StarcraftNormalizingOutputFilter
self.use_full_action_space = False
@property
def path(self):
return 'rl_coach.environments.starcraft2_environment:StarCraft2Environment'
# Environment
class StarCraft2Environment(Environment):
def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
seed: Union[None, int]=None, human_control: bool=False,
custom_reward_threshold: Union[int, float]=None,
screen_size: int=84, minimap_size: int=64,
feature_minimap_maps_to_use: List=range(7), feature_screen_maps_to_use: List=range(17),
observation_type: StarcraftObservationType=StarcraftObservationType.Features,
disable_fog: bool=False, auto_select_all_army: bool=True,
use_full_action_space: bool=False, **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
self.screen_size = screen_size
self.minimap_size = minimap_size
self.feature_minimap_maps_to_use = feature_minimap_maps_to_use
self.feature_screen_maps_to_use = feature_screen_maps_to_use
self.observation_type = observation_type
self.features_screen_size = None
self.feature_minimap_size = None
self.rgb_screen_size = None
self.rgb_minimap_size = None
if self.observation_type == StarcraftObservationType.Features:
self.features_screen_size = screen_size
self.feature_minimap_size = minimap_size
elif self.observation_type == StarcraftObservationType.RGB:
self.rgb_screen_size = screen_size
self.rgb_minimap_size = minimap_size
self.disable_fog = disable_fog
self.auto_select_all_army = auto_select_all_army
self.use_full_action_space = use_full_action_space
# step_mul is the equivalent of frame skipping. It is unclear whether the action is repeated for the skipped frames.
self.env = sc2_env.SC2Env(map_name=self.env_id, step_mul=frame_skip,
visualize=self.is_rendered,
agent_interface_format=sc2_env.AgentInterfaceFormat(
feature_dimensions=sc2_env.Dimensions(
screen=self.features_screen_size,
minimap=self.feature_minimap_size
)
# rgb_dimensions=sc2_env.Dimensions(
# screen=self.rgb_screen_size,
# minimap=self.rgb_screen_size
# )
),
# feature_screen_size=self.features_screen_size,
# feature_minimap_size=self.feature_minimap_size,
# rgb_screen_size=self.rgb_screen_size,
# rgb_minimap_size=self.rgb_screen_size,
disable_fog=disable_fog,
random_seed=self.seed
)
# print all the available actions
# self.env = available_actions_printer.AvailableActionsPrinter(self.env)
self.reset_internal_state(True)
"""
feature_screen: [height_map, visibility_map, creep, power, player_id, player_relative, unit_type, selected,
unit_hit_points, unit_hit_points_ratio, unit_energy, unit_energy_ratio, unit_shields,
unit_shields_ratio, unit_density, unit_density_aa, effects]
feature_minimap: [height_map, visibility_map, creep, camera, player_id, player_relative, selected]
player: [player_id, minerals, vespene, food_cap, food_army, food_workers, idle_worker_count,
army_count, warp_gate_count, larva_count]
"""
self.screen_shape = np.array(self.env.observation_spec()[0]['feature_screen'])
self.screen_shape[0] = len(self.feature_screen_maps_to_use)
self.minimap_shape = np.array(self.env.observation_spec()[0]['feature_minimap'])
self.minimap_shape[0] = len(self.feature_minimap_maps_to_use)
self.state_space = StateSpace({
"screen": PlanarMapsObservationSpace(shape=self.screen_shape, low=0, high=255, channels_axis=0),
"minimap": PlanarMapsObservationSpace(shape=self.minimap_shape, low=0, high=255, channels_axis=0),
"measurements": VectorObservationSpace(self.env.observation_spec()[0]["player"][0])
})
if self.use_full_action_space:
action_identifiers = list(self.env.action_spec()[0].functions)
num_action_identifiers = len(action_identifiers)
action_arguments = [(arg.name, arg.sizes) for arg in self.env.action_spec()[0].types]
sub_action_spaces = [DiscreteActionSpace(num_action_identifiers)]
for argument in action_arguments:
for dimension in argument[1]:
sub_action_spaces.append(DiscreteActionSpace(dimension))
self.action_space = CompoundActionSpace(sub_action_spaces)
else:
self.action_space = BoxActionSpace(2, 0, self.screen_size - 1, ["X-Axis", "Y-Axis"],
default_action=np.array([self.screen_size/2, self.screen_size/2]))
def _update_state(self):
timestep = 0
self.screen = self.last_result[timestep].observation.feature_screen
# extract only the requested segmentation maps from the observation
self.screen = np.take(self.screen, self.feature_screen_maps_to_use, axis=0)
self.minimap = self.last_result[timestep].observation.feature_minimap
self.measurements = self.last_result[timestep].observation.player
self.reward = self.last_result[timestep].reward
self.done = self.last_result[timestep].step_type == environment.StepType.LAST
self.state = {
'screen': self.screen,
'minimap': self.minimap,
'measurements': self.measurements
}
def _take_action(self, action):
if self.use_full_action_space:
action_identifier = action[0]
action_arguments = action[1:]
action = actions.FunctionCall(action_identifier, action_arguments)
else:
coord = np.array(action[0:2])
noop = False
coord = coord.round()
coord = np.clip(coord, 0, SCREEN_SIZE - 1)
self.last_action_idx = coord
if noop:
action = actions.FunctionCall(_NOOP, [])
else:
action = actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
self.last_result = self.env.step(actions=[action])
def _restart_environment_episode(self, force_environment_reset=False):
# reset the environment
self.last_result = self.env.reset()
# select all the units on the screen
if self.auto_select_all_army:
self.env.step(actions=[actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
def get_rendered_image(self):
screen = np.squeeze(np.tile(np.expand_dims(self.screen, -1), (1, 1, 3)))
screen = screen / np.max(screen) * 255
return screen.astype('uint8')
def dump_video_of_last_episode(self):
from rl_coach.logger import experiment_path
self.env._run_config.replay_dir = experiment_path
self.env.save_replay('replays')
super().dump_video_of_last_episode()
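# Configuration sketch (assumption: a pysc2 mini-game name is selected through the parameters'
# `level` attribute, mirroring the other Coach environment parameters):
#   env_params = StarCraft2EnvironmentParameters()
#   env_params.level = 'CollectMineralShards'
#   env_params.use_full_action_space = False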

View File

@@ -0,0 +1,82 @@
import numpy as np
import gym
from gym import spaces
import random
class BitFlip(gym.Env):
metadata = {
'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30
}
def __init__(self, bit_length=16, max_steps=None, mean_zero=False):
super(BitFlip, self).__init__()
if bit_length < 1:
raise ValueError('bit_length must be >= 1, found {}'.format(bit_length))
self.bit_length = bit_length
self.mean_zero = mean_zero
if max_steps is None:
# default to bit_length
self.max_steps = bit_length
elif max_steps == 0:
self.max_steps = None
else:
self.max_steps = max_steps
# spaces documentation: https://gym.openai.com/docs/
self.action_space = spaces.Discrete(bit_length)
self.observation_space = spaces.Dict({
'state': spaces.Box(low=0, high=1, shape=(bit_length, )),
'desired_goal': spaces.Box(low=0, high=1, shape=(bit_length, )),
'achieved_goal': spaces.Box(low=0, high=1, shape=(bit_length, ))
})
self.reset()
def _terminate(self):
return (self.state == self.goal).all() or (self.max_steps is not None and self.steps >= self.max_steps)
def _reward(self):
return -1 if (self.state != self.goal).any() else 0
def step(self, action):
# action is an int in the range [0, self.bit_length)
self.state[action] = int(not self.state[action])
self.steps += 1
return (self._get_obs(), self._reward(), self._terminate(), {})
def reset(self):
self.steps = 0
self.state = np.array([random.choice([1, 0]) for _ in range(self.bit_length)])
# make sure goal is not the initial state
self.goal = self.state
while (self.goal == self.state).all():
self.goal = np.array([random.choice([1, 0]) for _ in range(self.bit_length)])
return self._get_obs()
def _mean_zero(self, x):
if self.mean_zero:
return (x - 0.5) / 0.5
else:
return x
def _get_obs(self):
return {
'state': self._mean_zero(self.state),
'desired_goal': self._mean_zero(self.goal),
'achieved_goal': self._mean_zero(self.state)
}
def render(self, mode='human', close=False):
observation = np.zeros((20, 20 * self.bit_length, 3))
for bit_idx, (state_bit, goal_bit) in enumerate(zip(self.state, self.goal)):
# green if the bit matches
observation[:, bit_idx * 20:(bit_idx + 1) * 20, 1] = (state_bit == goal_bit) * 255
# red if the bit doesn't match
observation[:, bit_idx * 20:(bit_idx + 1) * 20, 0] = (state_bit != goal_bit) * 255
return observation
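# Minimal demo sketch (only needs gym and numpy, which are imported above):
if __name__ == '__main__':
    demo_env = BitFlip(bit_length=8)
    demo_obs = demo_env.reset()
    # flip bit 0 and inspect the goal-conditioned observation dict
    demo_obs, demo_reward, demo_done, _ = demo_env.step(0)
    print(demo_obs['desired_goal'], demo_obs['achieved_goal'], demo_reward, demo_done)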

Some files were not shown because too many files have changed in this diff.