mirror of https://github.com/gryf/coach.git synced 2026-02-10 18:45:51 +01:00

coach v0.8.0

Gal Leibovich
2017-10-19 13:10:15 +03:00
parent 7f77813a39
commit 1d4c3455e7
123 changed files with 10996 additions and 203 deletions

34
agents/__init__.py Normal file

@@ -0,0 +1,34 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.actor_critic_agent import *
from agents.agent import *
from agents.bootstrapped_dqn_agent import *
from agents.clipped_ppo_agent import *
from agents.ddpg_agent import *
from agents.ddqn_agent import *
from agents.dfp_agent import *
from agents.dqn_agent import *
from agents.distributional_dqn_agent import *
from agents.mmc_agent import *
from agents.n_step_q_agent import *
from agents.naf_agent import *
from agents.nec_agent import *
from agents.pal_agent import *
from agents.policy_gradients_agent import *
from agents.policy_optimization_agent import *
from agents.ppo_agent import *
from agents.value_optimization_agent import *

136
agents/actor_critic_agent.py Normal file

@@ -0,0 +1,136 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.policy_optimization_agent import *
from logger import *
from utils import *
import scipy.signal
# Actor Critic - https://arxiv.org/abs/1602.01783
class ActorCriticAgent(PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False):
PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
self.last_gradient_update_step_idx = 0
self.action_advantages = Signal('Advantages')
self.state_values = Signal('Values')
self.unclipped_grads = Signal('Grads (unclipped)')
self.signals.append(self.action_advantages)
self.signals.append(self.state_values)
self.signals.append(self.unclipped_grads)
# Discounting function used to calculate discounted returns.
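# Implementation note: scipy.signal.lfilter with b=[1], a=[1, -gamma] realizes the recursion
# y[n] = x[n] + gamma * y[n - 1]. Running it over the reversed input and reversing the result
# therefore gives y[t] = x[t] + gamma * y[t + 1], i.e. the discounted sum of all future elements.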
def discount(self, x, gamma):
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
def get_general_advantage_estimation_values(self, rewards, values):
# values contains n+1 elements (for states t ... t+n), rewards contains n elements (for steps t ... t+n-1)
bootstrap_extended_rewards = np.array(rewards.tolist() + [values[-1]])
# Approximation-based calculation of GAE (mathematically correct only when Tmax = inf,
# although in practice it works well even for much smaller Tmax values, e.g. 20)
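# Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438):
#   delta_t = r_t + gamma * V(s_t+1) - V(s_t)
#   A_t     = sum_l (gamma * lambda)^l * delta_t+l
# i.e. the advantage is a discounted sum of the TD residuals, with discount gamma * lambda.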
deltas = rewards + self.tp.agent.discount * values[1:] - values[:-1]
gae = self.discount(deltas, self.tp.agent.discount * self.tp.agent.gae_lambda)
if self.tp.agent.estimate_value_using_gae:
discounted_returns = np.expand_dims(gae + values[:-1], -1)
else:
discounted_returns = np.expand_dims(np.array(self.discount(bootstrap_extended_rewards,
self.tp.agent.discount)), 1)[:-1]
return gae, discounted_returns
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
# get the values for the current states
result = self.main_network.online_network.predict(current_states)
current_state_values = result[0]
self.state_values.add_sample(current_state_values)
# the targets for the state value estimator
num_transitions = len(game_overs)
state_value_head_targets = np.zeros((num_transitions, 1))
# estimate the advantage function
action_advantages = np.zeros((num_transitions, 1))
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if game_overs[-1]:
R = 0
else:
R = self.main_network.online_network.predict(np.expand_dims(next_states[-1], 0))[0]
for i in reversed(range(num_transitions)):
R = rewards[i] + self.tp.agent.discount * R
state_value_head_targets[i] = R
action_advantages[i] = R - current_state_values[i]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
bootstrapped_value = self.main_network.online_network.predict(np.expand_dims(next_states[-1], 0))[0]
values = np.append(current_state_values, bootstrapped_value)
if game_overs[-1]:
values[-1] = 0
# get general discounted returns table
gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(rewards, values)
action_advantages = np.vstack(gae_values)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
action_advantages = action_advantages.squeeze(axis=-1)
if not self.env.discrete_controls and len(actions.shape) < 2:
actions = np.expand_dims(actions, -1)
# train
result = self.main_network.online_network.accumulate_gradients([current_states, actions],
[state_value_head_targets, action_advantages])
# logging
total_loss, losses, unclipped_grads = result[:3]
self.action_advantages.add_sample(action_advantages)
self.unclipped_grads.add_sample(unclipped_grads)
logger.create_signal_value('Value Loss', losses[0])
logger.create_signal_value('Policy Loss', losses[1])
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
# convert to batch so we can run it through the network
observation = np.expand_dims(np.array(curr_state['observation']), 0)
if self.env.discrete_controls:
# DISCRETE
state_value, action_probabilities = self.main_network.online_network.predict(observation)
action_probabilities = action_probabilities.squeeze()
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_probabilities)
else:
action = np.argmax(action_probabilities)
action_info = {"action_probability": action_probabilities[action], "state_value": state_value}
self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities)))
else:
# CONTINUOUS
state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(observation)
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
if phase == RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
else:
action = action_values_mean
action_info = {"action_probability": action, "state_value": state_value}
return action, action_info

536
agents/agent.py Normal file

@@ -0,0 +1,536 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import scipy.ndimage
import scipy.misc
import matplotlib.pyplot as plt
import copy
from configurations import Preset
from collections import OrderedDict
from utils import RunPhase, Signal, is_empty, RunningStat
from architectures import *
from exploration_policies import *
from memories import *
from memories.memory import *
from logger import logger, screen
import random
import time
import os
import itertools
from architectures.tensorflow_components.shared_variables import SharedRunningStats
from six.moves import range
class Agent:
def __init__(self, env, tuning_parameters, replicated_device=None, task_id=0):
"""
:param env: An environment instance
:type env: EnvironmentWrapper
:param tuning_parameters: A Preset class instance with all the running parameters
:type tuning_parameters: Preset
:param replicated_device: A tensorflow device for distributed training (optional)
:type replicated_device: instancemethod
:param task_id: The current task id
:type task_id: int
"""
screen.log_title("Creating agent {}".format(task_id))
self.task_id = task_id
self.sess = tuning_parameters.sess
self.env = tuning_parameters.env_instance = env
# i/o dimensions
if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
tuning_parameters.env.desired_observation_width = self.env.width
tuning_parameters.env.desired_observation_height = self.env.height
self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size
self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size
if tuning_parameters.agent.use_accumulated_reward_as_measurement:
self.measurements_size = tuning_parameters.env.measurements_size = (self.measurements_size[0] + 1,)
# modules
self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
# self.architecture = eval(tuning_parameters.architecture)
self.has_global = replicated_device is not None
self.replicated_device = replicated_device
self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0"
self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)')
self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy
+ '(tuning_parameters)')
self.evaluation_exploration_policy.change_phase(RunPhase.TEST)
# initialize all internal variables
self.tp = tuning_parameters
self.in_heatup = False
self.total_reward_in_current_episode = 0
self.total_steps_counter = 0
self.running_reward = None
self.training_iteration = 0
self.current_episode = 0
self.curr_state = []
self.current_episode_steps_counter = 0
self.episode_running_info = {}
self.last_episode_evaluation_ran = 0
self.running_observations = []
logger.set_current_time(self.current_episode)
self.main_network = None
self.networks = []
self.last_episode_images = []
# signals
self.signals = []
self.loss = Signal('Loss')
self.signals.append(self.loss)
self.curr_learning_rate = Signal('Learning Rate')
self.signals.append(self.curr_learning_rate)
if self.tp.env.normalize_observation and not self.env.is_state_type_image:
if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,))
self.running_reward_stats = RunningStat(())
else:
self.running_observation_stats = SharedRunningStats(self.tp, replicated_device,
shape=(self.tp.env.desired_observation_width,),
name='observation_stats')
self.running_reward_stats = SharedRunningStats(self.tp, replicated_device,
shape=(),
name='reward_stats')
# env is already reset at this point. Otherwise we're getting an error where you cannot
# reset an env which is not done
self.reset_game(do_not_reset_env=True)
# use seed
if self.tp.seed is not None:
random.seed(self.tp.seed)
np.random.seed(self.tp.seed)
def log_to_screen(self, phase):
# log to screen
if self.current_episode > 0:
if phase == RunPhase.TEST:
exploration = self.evaluation_exploration_policy.get_control_param()
else:
exploration = self.exploration_policy.get_control_param()
screen.log_dict(
OrderedDict([
("Worker", self.task_id),
("Episode", self.current_episode),
("total reward", self.total_reward_in_current_episode),
("exploration", exploration),
("steps", self.total_steps_counter),
("training iteration", self.training_iteration)
]),
prefix="Heatup" if self.in_heatup else "Training" if phase == RunPhase.TRAIN else "Testing"
)
def update_log(self, phase=RunPhase.TRAIN):
"""
Writes logging messages to screen and updates the log file with all the signal values.
:return: None
"""
# log all the signals to file
logger.set_current_time(self.current_episode)
logger.create_signal_value('Training Iter', self.training_iteration)
logger.create_signal_value('In Heatup', int(self.in_heatup))
logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
logger.create_signal_value('ER #Episodes', self.memory.length())
logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
logger.create_signal_value('Total steps', self.total_steps_counter)
logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
if phase == RunPhase.TRAIN:
logger.create_signal_value("Training Reward", self.total_reward_in_current_episode)
elif phase == RunPhase.TEST:
logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode)
logger.update_wall_clock_time(self.current_episode)
for signal in self.signals:
logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
# dump
if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0:
logger.dump_output_csv()
def reset_game(self, do_not_reset_env=False):
"""
Resets all the episodic parameters and start a new environment episode.
:param do_not_reset_env: A boolean that allows prevention of environment reset
:return: None
"""
for signal in self.signals:
signal.reset()
self.total_reward_in_current_episode = 0
self.curr_state = []
self.last_episode_images = []
self.current_episode_steps_counter = 0
self.episode_running_info = {}
if not do_not_reset_env:
self.env.reset()
self.exploration_policy.reset()
# required for online plotting
if self.tp.visualization.plot_action_values_online:
if hasattr(self, 'episode_running_info') and hasattr(self.env, 'actions_description'):
for action in self.env.actions_description:
self.episode_running_info[action] = []
plt.clf()
if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
for network in self.networks:
network.curr_rnn_c_in = network.middleware_embedder.c_init
network.curr_rnn_h_in = network.middleware_embedder.h_init
def stack_observation(self, curr_stack, observation):
"""
Adds a new observation to an existing stack of observations from previous time-steps.
:param curr_stack: The current observations stack.
:param observation: The new observation
:return: The updated observation stack
"""
if curr_stack == []:
# starting an episode
curr_stack = np.vstack(np.expand_dims([observation] * self.tp.env.observation_stack_size, 0))
curr_stack = self.switch_axes_order(curr_stack, from_type='channels_first', to_type='channels_last')
else:
curr_stack = np.append(curr_stack, np.expand_dims(np.squeeze(observation), axis=-1), axis=-1)
curr_stack = np.delete(curr_stack, 0, -1)
return curr_stack
def preprocess_observation(self, observation):
"""
Preprocesses the given observation.
For images - convert to grayscale, resize and convert to int.
For measurements vectors - normalize by a running average and std.
:param observation: The agents observation
:return: A processed version of the observation
"""
if self.env.is_state_type_image:
# rescale
observation = scipy.misc.imresize(observation,
(self.tp.env.desired_observation_height,
self.tp.env.desired_observation_width),
interp=self.tp.rescaling_interpolation_type)
# rgb to y
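# the 0.2989 / 0.5870 / 0.1140 weights below are the standard ITU-R BT.601 luma coefficients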
if len(observation.shape) > 2 and observation.shape[2] > 1:
r, g, b = observation[:, :, 0], observation[:, :, 1], observation[:, :, 2]
observation = 0.2989 * r + 0.5870 * g + 0.1140 * b
return observation.astype('uint8')
else:
if self.tp.env.normalize_observation:
# standardize the input observation using a running mean and std
if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
self.running_observation_stats.push(observation)
observation = (observation - self.running_observation_stats.mean) / \
(self.running_observation_stats.std + 1e-15)
observation = np.clip(observation, -5.0, 5.0)
return observation
def learn_from_batch(self, batch):
"""
Given a batch of transitions, calculates their target values and updates the network.
:param batch: A list of transitions
:return: The loss of the training
"""
pass
def train(self):
"""
A single training iteration. Sample a batch, train on it and update target networks.
:return: The training loss.
"""
batch = self.memory.sample(self.tp.batch_size)
loss = self.learn_from_batch(batch)
if self.tp.learning_rate_decay_rate != 0:
self.curr_learning_rate.add_sample(self.tp.sess.run(self.tp.learning_rate))
else:
self.curr_learning_rate.add_sample(self.tp.learning_rate)
# update the target network of every network that has a target network
if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
for network in self.networks:
network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
logger.create_signal_value('Update Target Network', 1)
else:
logger.create_signal_value('Update Target Network', 0, overwrite=False)
return loss
def extract_batch(self, batch):
"""
Extracts a single numpy array for each object in a batch of transitions (state, action, etc.)
:param batch: An array of transitions
:return: For each transition element, returns a numpy array of all the transitions in the batch
"""
current_observations = np.array([transition.state['observation'] for transition in batch])
next_observations = np.array([transition.next_state['observation'] for transition in batch])
actions = np.array([transition.action for transition in batch])
rewards = np.array([transition.reward for transition in batch])
game_overs = np.array([transition.game_over for transition in batch])
total_return = np.array([transition.total_return for transition in batch])
current_states = current_observations
next_states = next_observations
# get the entire state including measurements if available
if self.tp.agent.use_measurements:
current_measurements = np.array([transition.state['measurements'] for transition in batch])
next_measurements = np.array([transition.next_state['measurements'] for transition in batch])
current_states = [current_observations, current_measurements]
next_states = [next_observations, next_measurements]
return current_states, next_states, actions, rewards, game_overs, total_return
def plot_action_values_online(self):
"""
Plot an animated graph of the value of each possible action during the episode
:return: None
"""
plt.clf()
for key, data_list in self.episode_running_info.items():
plt.plot(data_list, label=key)
plt.legend()
plt.pause(0.00000001)
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
"""
Choose an action to act with during the current episode. Different behavior may be exhibited when training
or testing.
:param curr_state: the current state to act upon.
:param phase: the current phase: training or testing.
:return: chosen action, some action value describing the action (q-value, probability, etc)
"""
pass
def preprocess_reward(self, reward):
if self.tp.env.reward_scaling:
reward /= float(self.tp.env.reward_scaling)
if self.tp.env.reward_clipping_max:
reward = min(reward, self.tp.env.reward_clipping_max)
if self.tp.env.reward_clipping_min:
reward = max(reward, self.tp.env.reward_clipping_min)
return reward
def switch_axes_order(self, observation, from_type='channels_first', to_type='channels_last'):
"""
transpose an observation axes from channels_first to channels_last or vice versa
:param observation: a numpy array
:param from_type: can be 'channels_first' or 'channels_last'
:param to_type: can be 'channels_first' or 'channels_last'
:return: a new observation with the requested axes order
"""
if from_type == to_type:
return observation
assert 2 <= len(observation.shape) <= 3, 'num axes of an observation must be 2 for a vector or 3 for an image'
assert type(observation) == np.ndarray, 'observation must be a numpy array'
if len(observation.shape) == 3:
if from_type == 'channels_first' and to_type == 'channels_last':
return np.transpose(observation, (1, 2, 0))
elif from_type == 'channels_last' and to_type == 'channels_first':
return np.transpose(observation, (2, 0, 1))
else:
return np.transpose(observation, (1, 0))
def act(self, phase=RunPhase.TRAIN):
"""
Take one step in the environment according to the network prediction and store the transition in memory
:param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored
:return: A boolean value that signals an episode termination
"""
self.total_steps_counter += 1
self.current_episode_steps_counter += 1
# get new action
action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0}
is_first_transition_in_episode = (self.curr_state == [])
if is_first_transition_in_episode:
observation = self.preprocess_observation(self.env.observation)
observation = self.stack_observation([], observation)
self.curr_state = {'observation': observation}
if self.tp.agent.use_measurements:
self.curr_state['measurements'] = self.env.measurements
if self.tp.agent.use_accumulated_reward_as_measurement:
self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)
if self.in_heatup: # we do not have a stacked curr_state yet
action = self.env.get_random_action()
else:
action, action_info = self.choose_action(self.curr_state, phase=phase)
# perform action
if type(action) == np.ndarray:
action = action.squeeze()
result = self.env.step(action)
shaped_reward = self.preprocess_reward(result['reward'])
if 'action_intrinsic_reward' in action_info.keys():
shaped_reward += action_info['action_intrinsic_reward']
self.total_reward_in_current_episode += result['reward']
observation = self.preprocess_observation(result['observation'])
# plot action values online
if self.tp.visualization.plot_action_values_online and not self.in_heatup:
self.plot_action_values_online()
# initialize the next state
observation = self.stack_observation(self.curr_state['observation'], observation)
next_state = {'observation': observation}
if self.tp.agent.use_measurements and 'measurements' in result.keys():
next_state['measurements'] = result['measurements']
if self.tp.agent.use_accumulated_reward_as_measurement:
next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode)
# store the transition only if we are training
if phase == RunPhase.TRAIN:
transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
for key in action_info.keys():
transition.info[key] = action_info[key]
if self.tp.agent.add_a_normalized_timestep_to_the_observation:
transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit
self.memory.store(transition)
elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs:
# we store the transitions only for saving gifs
self.last_episode_images.append(self.env.get_rendered_image())
# update the current state for the next step
self.curr_state = next_state
# deal with episode termination
if result['done']:
if self.tp.visualization.dump_csv:
self.update_log(phase=phase)
self.log_to_screen(phase=phase)
if phase == RunPhase.TRAIN:
self.reset_game()
self.current_episode += 1
# return whether the episode has really ended
return result['done']
def evaluate(self, num_episodes, keep_networks_synced=False):
"""
Run in an evaluation mode for several episodes. Actions will be chosen greedily.
:param keep_networks_synced: keep the online network in sync with the global network after every episode
:param num_episodes: The number of episodes to evaluate on
:return: None
"""
max_reward_achieved = -float('inf')
average_evaluation_reward = 0
screen.log_title("Running evaluation")
self.env.change_phase(RunPhase.TEST)
for i in range(num_episodes):
# keep the online network in sync with the global network
if keep_networks_synced:
for network in self.networks:
network.sync()
episode_ended = False
while not episode_ended:
episode_ended = self.act(phase=RunPhase.TEST)
if self.tp.visualization.dump_gifs and self.total_reward_in_current_episode > max_reward_achieved:
max_reward_achieved = self.total_reward_in_current_episode
frame_skipping = int(5/self.tp.env.frame_skip)
logger.create_gif(self.last_episode_images[::frame_skipping],
name='score-{}'.format(max_reward_achieved), fps=10)
average_evaluation_reward += self.total_reward_in_current_episode
self.reset_game()
average_evaluation_reward /= float(num_episodes)
self.env.change_phase(RunPhase.TRAIN)
screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))
def post_training_commands(self):
pass
def improve(self):
"""
Training algorithms wrapper. Heatup >> [ Evaluate >> Play >> Train >> Save checkpoint ]
:return: None
"""
# synchronize the online network weights with the global network
for network in self.networks:
network.sync()
# heatup phase
if self.tp.num_heatup_steps != 0:
self.in_heatup = True
screen.log_title("Starting heatup {}".format(self.task_id))
num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)):
self.act()
# training phase
self.in_heatup = False
screen.log_title("Starting training {}".format(self.task_id))
self.exploration_policy.change_phase(RunPhase.TRAIN)
training_start_time = time.time()
model_snapshots_periods_passed = -1
while self.training_iteration < self.tp.num_training_iterations:
# evaluate
evaluate_agent = (self.last_episode_evaluation_ran != self.current_episode) and \
(self.current_episode % self.tp.evaluate_every_x_episodes == 0)
if evaluate_agent:
self.last_episode_evaluation_ran = self.current_episode
self.evaluate(self.tp.evaluation_episodes)
# snapshot model
if self.tp.save_model_sec and self.tp.save_model_sec > 0 and not self.tp.distributed:
total_training_time = time.time() - training_start_time
current_snapshot_period = (int(total_training_time) // self.tp.save_model_sec)
if current_snapshot_period > model_snapshots_periods_passed:
model_snapshots_periods_passed = current_snapshot_period
self.main_network.save_model(model_snapshots_periods_passed)
# play and record in replay buffer
if self.tp.agent.step_until_collecting_full_episodes:
step = 0
while step < self.tp.agent.num_consecutive_playing_steps or self.memory.get_episode(-1).length() != 0:
self.act()
step += 1
else:
for step in range(self.tp.agent.num_consecutive_playing_steps):
self.act()
# train
if self.tp.train:
for step in range(self.tp.agent.num_consecutive_training_steps):
loss = self.train()
self.loss.add_sample(loss)
self.training_iteration += 1
self.post_training_commands()

58
agents/bootstrapped_dqn_agent.py Normal file

@@ -0,0 +1,58 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
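# The agent keeps several Q-value heads. At the start of every episode the exploration policy
# selects one head to act with (see reset_game), and every stored transition carries a binary
# mask (sampled in act below) that determines which heads are allowed to train on it.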
class BootstrappedDQNAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
def reset_game(self, do_not_reset_env=False):
ValueOptimizationAgent.reset_game(self, do_not_reset_env)
self.exploration_policy.select_head()
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
# for the action we actually took, the error is:
# TD error = r + discount*max(q_st_plus_1) - q_st
# for all other actions, the error is 0
q_st_plus_1 = self.main_network.target_network.predict(next_states)
# initialize the targets with the current prediction so that we will
# only update the action that we have actually done in this transition
TD_targets = self.main_network.online_network.predict(current_states)
for i in range(self.tp.batch_size):
mask = batch[i].info['mask']
for head_idx in range(self.tp.exploration.architecture_num_q_heads):
if mask[head_idx] == 1:
TD_targets[head_idx][i, actions[i]] = rewards[i] + \
(1.0 - game_overs[i]) * self.tp.agent.discount * np.max(
q_st_plus_1[head_idx][i], 0)
result = self.main_network.train_and_sync_networks(current_states, TD_targets)
total_loss = result[0]
return total_loss
def act(self, phase=RunPhase.TRAIN):
ValueOptimizationAgent.act(self, phase)
mask = np.random.binomial(1, self.tp.exploration.bootstrapped_data_sharing_probability,
self.tp.exploration.architecture_num_q_heads)
self.memory.update_last_transition_info({'mask': mask})

210
agents/clipped_ppo_agent.py Normal file

@@ -0,0 +1,210 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.actor_critic_agent import *
from random import shuffle
import tensorflow as tf
# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
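# The clipped surrogate objective being optimized is
#   L_CLIP(theta) = E_t[min(r_t(theta) * A_t, clip(r_t(theta), 1 - epsilon, 1 + epsilon) * A_t)]
# where r_t(theta) is the probability ratio between the updated policy and the data-collecting
# policy (taken here from the target network), and A_t is the GAE-based advantage.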
class ClippedPPOAgent(ActorCriticAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
# signals definition
self.value_loss = Signal('Value Loss')
self.signals.append(self.value_loss)
self.policy_loss = Signal('Policy Loss')
self.signals.append(self.policy_loss)
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = Signal('Grads (unclipped)')
self.signals.append(self.unclipped_grads)
self.value_targets = Signal('Value Targets')
self.signals.append(self.value_targets)
self.kl_divergence = Signal('KL Divergence')
self.signals.append(self.kl_divergence)
def fill_advantages(self, batch):
current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)
current_state_values = self.main_network.online_network.predict([current_states])[0]
current_state_values = current_state_values.squeeze()
self.state_values.add_sample(current_state_values)
# calculate advantages
advantages = []
value_targets = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
advantages = total_return - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
value_targets = np.array([])
for idx, game_over in enumerate(game_overs):
if game_over:
# get advantages for the rollout
value_bootstrapping = np.zeros((1,))
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
rollout_advantages, gae_based_value_targets = \
self.get_general_advantage_estimation_values(rewards[episode_start_idx:idx+1],
rollout_state_values)
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
value_targets = np.append(value_targets, gae_based_value_targets)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
for transition, advantage, value_target in zip(batch, advantages, value_targets):
transition.info['advantage'] = advantage
transition.info['gae_based_value_target'] = value_target
self.action_advantages.add_sample(advantages)
def train_network(self, dataset, epochs):
loss = []
for j in range(epochs):
loss = {
'total_loss': [],
'policy_losses': [],
'unclipped_grads': [],
'fetch_result': []
}
shuffle(dataset)
for i in range(int(len(dataset) / self.tp.batch_size)):
batch = dataset[i * self.tp.batch_size:(i + 1) * self.tp.batch_size]
current_states, _, actions, _, _, total_return = self.extract_batch(batch)
advantages = np.array([t.info['advantage'] for t in batch])
gae_based_value_targets = np.array([t.info['gae_based_value_target'] for t in batch])
if not self.tp.env_instance.discrete_controls and len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
result = self.main_network.target_network.predict([current_states])
old_policy_distribution = result[1:]
# calculate gradients and apply on both the local policy network and on the global policy network
fetches = [self.main_network.online_network.output_heads[1].kl_divergence,
self.main_network.online_network.output_heads[1].entropy]
total_return = np.expand_dims(total_return, -1)
value_targets = gae_based_value_targets if self.tp.agent.estimate_value_using_gae else total_return
total_loss, policy_losses, unclipped_grads, fetch_result =\
self.main_network.online_network.accumulate_gradients(
[current_states] + [actions] + old_policy_distribution,
[total_return, advantages], additional_fetches=fetches)
self.value_targets.add_sample(value_targets)
if self.tp.distributed:
self.main_network.apply_gradients_to_global_network()
self.main_network.update_online_network()
else:
self.main_network.apply_gradients_to_online_network()
self.main_network.online_network.reset_accumulated_gradients()
loss['total_loss'].append(total_loss)
loss['policy_losses'].append(policy_losses)
loss['unclipped_grads'].append(unclipped_grads)
loss['fetch_result'].append(fetch_result)
self.unclipped_grads.add_sample(unclipped_grads)
for key in loss.keys():
loss[key] = np.mean(loss[key], 0)
if self.tp.learning_rate_decay_rate != 0:
curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
self.curr_learning_rate.add_sample(curr_learning_rate)
else:
curr_learning_rate = self.tp.learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
("training epoch", j),
("learning_rate", curr_learning_rate)
]),
prefix="Policy training"
)
self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
self.entropy.add_sample(loss['fetch_result'][1])
self.kl_divergence.add_sample(loss['fetch_result'][0])
return policy_losses
def post_training_commands(self):
# clean memory
self.memory.clean()
def train(self):
self.main_network.sync()
dataset = self.memory.transitions
self.fill_advantages(dataset)
# take only the requested number of steps
dataset = dataset[:self.tp.agent.num_consecutive_playing_steps]
if self.tp.distributed and self.tp.agent.share_statistics_between_workers:
self.running_observation_stats.push(np.array([t.state['observation'] for t in dataset]))
losses = self.train_network(dataset, 10)
self.value_loss.add_sample(losses[0])
self.policy_loss.add_sample(losses[1])
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(losses[0], losses[1])
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
# convert to batch so we can run it through the network
observation = curr_state['observation']
observation = np.expand_dims(np.array(observation), 0)
if self.env.discrete_controls:
# DISCRETE
_, action_values = self.main_network.online_network.predict(observation)
action_values = action_values.squeeze()
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
action_info = {"action_probability": action_values[action]}
# self.entropy.add_sample(-np.sum(action_values * np.log(action_values)))
else:
# CONTINUOUS
_, action_values_mean, action_values_std = self.main_network.online_network.predict(observation)
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
if phase == RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
# if self.current_episode % 5 == 0 and self.current_episode_steps_counter < 5:
# print action
else:
action = action_values_mean
action_info = {"action_probability": action_values_mean}
return action, action_info

104
agents/ddpg_agent.py Normal file

@@ -0,0 +1,104 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.actor_critic_agent import *
from configurations import *
# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
class DDPGAgent(ActorCriticAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
# define critic network
self.critic_network = self.main_network
# self.networks.append(self.critic_network)
# define actor network
tuning_parameters.agent.input_types = [InputTypes.Observation]
tuning_parameters.agent.output_types = [OutputTypes.Pi]
self.actor_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'actor',
self.replicated_device, self.worker_device)
self.networks.append(self.actor_network)
self.q_values = Signal("Q")
self.signals.append(self.q_values)
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
# TD error = r + discount*max(q_st_plus_1) - q_st
next_actions = self.actor_network.target_network.predict([next_states])
q_st_plus_1 = self.critic_network.target_network.predict([next_states, next_actions])
TD_targets = np.expand_dims(rewards, -1) + \
(1.0 - np.expand_dims(game_overs, -1)) * self.tp.agent.discount * q_st_plus_1
# get the gradients of the critic output with respect to the action
actions_mean = self.actor_network.online_network.predict(current_states)
critic_online_network = self.critic_network.online_network
action_gradients = self.critic_network.sess.run(critic_online_network.gradients_wrt_inputs[1],
feed_dict={
critic_online_network.inputs[0]: current_states,
critic_online_network.inputs[1]: actions_mean,
})[0]
# train the critic
if len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
result = self.critic_network.train_and_sync_networks([current_states, actions], TD_targets)
total_loss = result[0]
# apply the gradients from the critic to the actor
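# deterministic policy gradient: grad_theta J ~ E[grad_a Q(s, a)|a=mu(s) * grad_theta mu(s)],
# so the critic's action gradients are fed in as the weights of the actor's weighted gradients.
# The sign is flipped since the optimizer minimizes, while we want to ascend the critic's value.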
actor_online_network = self.actor_network.online_network
gradients = self.actor_network.sess.run(actor_online_network.weighted_gradients,
feed_dict={
actor_online_network.gradients_weights_ph: -action_gradients,
actor_online_network.inputs[0]: current_states
})
if self.actor_network.has_global:
self.actor_network.global_network.apply_gradients(gradients)
self.actor_network.update_online_network()
else:
self.actor_network.online_network.apply_gradients(gradients)
return total_loss
def train(self):
return Agent.train(self)
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
assert not self.env.discrete_controls, 'DDPG works only for continuous control problems'
# convert to batch so we can run it through the network
observation = np.expand_dims(np.array(curr_state['observation']), 0)
result = self.actor_network.online_network.predict(observation)
action_values = result[0].squeeze()
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values
action = np.clip(action, self.env.action_space_low, self.env.action_space_high)
# get q value
action_batch = np.expand_dims(action, 0)
if type(action) != np.ndarray:
action_batch = np.array([[action]])
q_value = self.critic_network.online_network.predict([observation, action_batch])[0]
self.q_values.add_sample(q_value)
action_info = {"action_value": q_value}
return action, action_info

42
agents/ddqn_agent.py Normal file

@@ -0,0 +1,42 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
# Double DQN - https://arxiv.org/abs/1509.06461
class DDQNAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1)
q_st_plus_1 = self.main_network.target_network.predict(next_states)
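# Double DQN decouples action selection from action evaluation: the online network selects the
# argmax action for the next state while the target network provides its value, which reduces
# the overestimation bias of standard DQN.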
TD_targets = self.main_network.online_network.predict(current_states)
# initialize with the current prediction so that we will
# only update the action that we have actually done in this transition
for i in range(self.tp.batch_size):
TD_targets[i, actions[i]] = rewards[i] \
+ (1.0 - game_overs[i]) * self.tp.agent.discount * q_st_plus_1[i][
selected_actions[i]]
result = self.main_network.train_and_sync_networks(current_states, TD_targets)
total_loss = result[0]
return total_loss

83
agents/dfp_agent.py Normal file

@@ -0,0 +1,83 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.agent import *
# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
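# DFP learns to predict future measurements (e.g. health, ammo) for every action, and selects
# actions by scoring each one with the inner product between its predicted future measurements
# and the current goal vector.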
class DFPAgent(Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.current_goal = self.tp.agent.goal_vector
self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch)
# create the inputs for the network
input = current_states
input.append(np.repeat(np.expand_dims(self.current_goal, 0), self.tp.batch_size, 0))
# get the current outputs of the network
targets = self.main_network.online_network.predict(input)
# change the targets for the taken actions
for i in range(self.tp.batch_size):
targets[i, actions[i]] = batch[i].info['future_measurements'].flatten()
result = self.main_network.train_and_sync_networks(current_states, targets)
total_loss = result[0]
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
# convert to batch so we can run it through the network
observation = np.expand_dims(np.array(curr_state['observation']), 0)
measurements = np.expand_dims(np.array(curr_state['measurements']), 0)
goal = np.expand_dims(self.current_goal, 0)
# predict the future measurements
measurements_future_prediction = self.main_network.online_network.predict([observation, measurements, goal])[0]
action_values = np.zeros((self.action_space_size,))
num_steps_used_for_objective = len(self.tp.agent.future_measurements_weights)
# calculate the score of each action by multiplying its future measurements by the goal vector
for action_idx in range(self.action_space_size):
action_measurements = measurements_future_prediction[action_idx]
action_measurements = np.reshape(action_measurements,
(self.tp.agent.num_predicted_steps_ahead, self.measurements_size[0]))
future_steps_values = np.dot(action_measurements, self.current_goal)
action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:],
self.tp.agent.future_measurements_weights)
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
action_values = action_values.squeeze()
# store information for plotting interactively (actual plotting is done in agent)
if self.tp.visualization.plot_action_values_online:
for idx, action_name in enumerate(self.env.actions_description):
self.episode_running_info[action_name].append(action_values[idx])
action_info = {"action_probability": 0, "action_value": action_values[action]}
return action, action_info

60
agents/distributional_dqn_agent.py Normal file

@@ -0,0 +1,60 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
# Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
class DistributionalDQNAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
# prediction's format is (batch,actions,atoms)
def get_q_values(self, prediction):
return np.dot(prediction, self.z_values)
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
# for the action we actually took, the error is calculated by the atoms distribution
# for all other actions, the error is 0
distributed_q_st_plus_1 = self.main_network.target_network.predict(next_states)
# initialize the targets with the current prediction so that we will
# only update the action that we have actually done in this transition
TD_targets = self.main_network.online_network.predict(current_states)
target_actions = np.argmax(self.get_q_values(distributed_q_st_plus_1), axis=1)
m = np.zeros((self.tp.batch_size, self.z_values.size))
batches = np.arange(self.tp.batch_size)
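# project the bootstrapped distribution onto the fixed support z (the C51 projection step):
#   Tz_j = clip(r + gamma * z_j, v_min, v_max),  b_j = (Tz_j - v_min) / delta_z
# and split each atom's probability mass between the neighboring atoms floor(b_j) and ceil(b_j)
# in proportion to its distance from them.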
for j in range(self.z_values.size):
tzj = np.fmax(np.fmin(rewards + (1.0 - game_overs) * self.tp.agent.discount * self.z_values[j],
self.z_values[self.z_values.size - 1]),
self.z_values[0])
bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
u = (np.ceil(bj)).astype(int)
l = (np.floor(bj)).astype(int)
m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))
m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))
# total_loss = cross entropy between actual result above and predicted result for the given action
TD_targets[batches, actions] = m
result = self.main_network.train_and_sync_networks(current_states, TD_targets)
total_loss = result[0]
return total_loss

43
agents/dqn_agent.py Normal file

@@ -0,0 +1,43 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
class DQNAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
# for the action we actually took, the error is:
# TD error = r + discount*max(q_st_plus_1) - q_st
# for all other actions, the error is 0
q_st_plus_1 = self.main_network.target_network.predict(next_states)
# initialize the targets with the current prediction so that we will
# only update the action that we have actually done in this transition
TD_targets = self.main_network.online_network.predict(current_states)
for i in range(self.tp.batch_size):
TD_targets[i, actions[i]] = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(
q_st_plus_1[i], 0)
result = self.main_network.train_and_sync_networks(current_states, TD_targets)
total_loss = result[0]
return total_loss

42
agents/mmc_agent.py Normal file

@@ -0,0 +1,42 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
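# Mixed Monte Carlo (MMC): the one-step (double DQN style) target is interpolated with the full
# Monte Carlo return of the episode,
#   target = (1 - mixing_rate) * one_step_target + mixing_rate * monte_carlo_target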
class MixedMonteCarloAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)
TD_targets = self.main_network.online_network.predict(current_states)
selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1)
q_st_plus_1 = self.main_network.target_network.predict(next_states)
# initialize with the current prediction so that we will
# only update the action that we have actually done in this transition
for i in range(self.tp.batch_size):
one_step_target = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * q_st_plus_1[i][
selected_actions[i]]
monte_carlo_target = total_return[i]
TD_targets[i, actions[i]] = (1 - self.mixing_rate) * one_step_target + self.mixing_rate * monte_carlo_target
result = self.main_network.train_and_sync_networks(current_states, TD_targets)
total_loss = result[0]
return total_loss

85
agents/n_step_q_agent.py Normal file

@@ -0,0 +1,85 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
from agents.policy_optimization_agent import *
from logger import *
from utils import *
import scipy.signal
# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True)
self.last_gradient_update_step_idx = 0
self.q_values = Signal('Q Values')
self.unclipped_grads = Signal('Grads (unclipped)')
self.signals.append(self.q_values)
self.signals.append(self.unclipped_grads)
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
# get the values for the current states
state_value_head_targets = self.main_network.online_network.predict(current_states)
# the targets for the state value estimator
num_transitions = len(game_overs)
if self.tp.agent.targets_horizon == '1-Step':
# 1-Step Q learning
q_st_plus_1 = self.main_network.target_network.predict(next_states)
for i in reversed(range(num_transitions)):
state_value_head_targets[i][actions[i]] = \
rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(q_st_plus_1[i], 0)
elif self.tp.agent.targets_horizon == 'N-Step':
# N-Step Q learning
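# the N-step target bootstraps only from the last state of the rollout:
#   R_t = r_t + gamma * r_t+1 + ... + gamma^(n-1) * r_t+n-1 + gamma^n * max_a Q_target(s_t+n, a)
# and is computed below by accumulating R backwards from the bootstrap value.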
if game_overs[-1]:
R = 0
else:
R = np.max(self.main_network.target_network.predict(np.expand_dims(next_states[-1], 0)))
for i in reversed(range(num_transitions)):
R = rewards[i] + self.tp.agent.discount * R
state_value_head_targets[i][actions[i]] = R
else:
assert False, 'The available values for targets_horizon are: 1-Step, N-Step'
# train
result = self.main_network.online_network.accumulate_gradients([current_states], [state_value_head_targets])
# logging
total_loss, losses, unclipped_grads = result[:3]
self.unclipped_grads.add_sample(unclipped_grads)
logger.create_signal_value('Value Loss', losses[0])
return total_loss
def train(self):
# update the target network of every network that has a target network
if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
for network in self.networks:
network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
logger.create_signal_value('Update Target Network', 1)
else:
logger.create_signal_value('Update Target Network', 0, overwrite=False)
return PolicyOptimizationAgent.train(self)

75
agents/naf_agent.py Normal file

@@ -0,0 +1,75 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
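# NAF decomposes the Q function so that its maximizer is available in closed form:
#   Q(s, a) = V(s) + A(s, a),  A(s, a) = -0.5 * (a - mu(s))^T * P(s) * (a - mu(s))
# where P(s) = L(s) * L(s)^T is built from a learned lower-triangular matrix, so the greedy
# action is simply mu(s).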
class NAFAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.l_values = Signal("L")
self.a_values = Signal("Advantage")
self.mu_values = Signal("Action")
self.v_values = Signal("V")
self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values]
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
# TD error = r + discount*v_st_plus_1 - q_st
v_st_plus_1 = self.main_network.sess.run(self.main_network.target_network.output_heads[0].V,
feed_dict={self.main_network.target_network.inputs[0]: next_states})
TD_targets = np.expand_dims(rewards, -1) + (1.0 - np.expand_dims(game_overs, -1)) * self.tp.agent.discount * v_st_plus_1
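# in NAF, A(s, mu(s)) = 0 by construction, so V(s') from the target network equals
# max_a Q(s', a) and can be used directly as the bootstrap value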
if len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
result = self.main_network.train_and_sync_networks([current_states, actions], TD_targets)
total_loss = result[0]
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
assert not self.env.discrete_controls, 'NAF works only for continuous control problems'
# convert to batch so we can run it through the network
observation = np.expand_dims(np.array(curr_state['observation']), 0)
naf_head = self.main_network.online_network.output_heads[0]
action_values = self.main_network.sess.run(naf_head.mu,
feed_dict={self.main_network.online_network.inputs[0]: observation})
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values
Q, L, A, mu, V = self.main_network.sess.run(
[naf_head.Q, naf_head.L, naf_head.A, naf_head.mu, naf_head.V],
feed_dict={
self.main_network.online_network.inputs[0]: observation,
self.main_network.online_network.inputs[1]: action_values
}
)
# store the q values statistics for logging
self.q_values.add_sample(Q)
self.l_values.add_sample(L)
self.a_values.add_sample(A)
self.mu_values.add_sample(mu)
self.v_values.add_sample(V)
action_value = {"action_value": Q}
return action, action_value

104
agents/nec_agent.py Normal file
View File

@@ -0,0 +1,104 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
class NECAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=False)
self.current_episode_state_embeddings = []
self.current_episode_actions = []
self.training_started = False
def learn_from_batch(self, batch):
if not self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn):
return 0
else:
if not self.training_started:
self.training_started = True
screen.log_title("Finished collecting initial entries in DND. Starting to train network...")
current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)
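# NEC regresses the DND-backed Q value of the taken action directly onto the observed
# return (total_return), rather than onto a bootstrapped TD target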
result = self.main_network.train_and_sync_networks([current_states, actions], total_return)
total_loss = result[0]
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
# convert to batch so we can run it through the network
observation = np.expand_dims(np.array(curr_state['observation']), 0)
# get embedding
embedding = self.main_network.sess.run(self.main_network.online_network.state_embedding,
feed_dict={self.main_network.online_network.inputs[0]: observation})
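# the state embedding is the key used to query the DND; it is stored so it can be
# inserted into the DND together with the episode returns in reset_game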
self.current_episode_state_embeddings.append(embedding[0])
# get action values
if self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn):
# if there are enough entries in the DND then we can query it to get the action values
actions_q_values = []
for action in range(self.action_space_size):
feed_dict = {
self.main_network.online_network.state_embedding: embedding,
self.main_network.online_network.output_heads[0].input[0]: np.array([action])
}
q_value = self.main_network.sess.run(
self.main_network.online_network.output_heads[0].output, feed_dict=feed_dict)
actions_q_values.append(q_value[0])
else:
# the DND does not have enough entries yet, so only the embedding is collected
# (for later insertion into the DND) and zero Q values are returned
actions_q_values = [0] * self.action_space_size
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(actions_q_values)
self.current_episode_actions.append(action)
else:
action = np.argmax(actions_q_values)
# store the q values statistics for logging
self.q_values.add_sample(actions_q_values)
# store information for plotting interactively (actual plotting is done in agent)
if self.tp.visualization.plot_action_values_online:
for idx, action_name in enumerate(self.env.actions_description):
self.episode_running_info[action_name].append(actions_q_values[idx])
action_value = {"action_value": actions_q_values[action]}
return action, action_value
def reset_game(self, do_not_reset_env=False):
ValueOptimizationAgent.reset_game(self, do_not_reset_env)
# make sure we already have at least one episode
if self.memory.num_complete_episodes() >= 1 and not self.in_heatup:
# get the last full episode that we have collected
episode = self.memory.get(-2)
returns = []
for i in range(episode.length()):
returns.append(episode.get_transition(i).total_return)
# The end of heatup can fall in the middle of an episode. In that case the episode fetched
# from the ER is complete, while the embeddings and actions collected here cover only the
# training part, so we align the returns with the collected actions.
returns = returns[-len(self.current_episode_actions):]
self.main_network.online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
self.current_episode_actions, returns)
self.current_episode_state_embeddings = []
self.current_episode_actions = []

65
agents/pal_agent.py Normal file
View File

@@ -0,0 +1,65 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.value_optimization_agent import *
# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
class PALAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.alpha = tuning_parameters.agent.pal_alpha
self.persistent = tuning_parameters.agent.persistent_advantage_learning
self.monte_carlo_mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)
selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1)
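# Double-DQN-style target: the next action is selected with the online network here
# and evaluated with the target network below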
# next state values
q_st_plus_1_target = self.main_network.target_network.predict(next_states)
v_st_plus_1_target = np.max(q_st_plus_1_target, 1)
# current state values according to online network
q_st_online = self.main_network.online_network.predict(current_states)
# current state values according to target network
q_st_target = self.main_network.target_network.predict(current_states)
v_st_target = np.max(q_st_target, 1)
# calculate TD error
TD_targets = np.copy(q_st_online)
for i in range(self.tp.batch_size):
TD_targets[i, actions[i]] = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * \
q_st_plus_1_target[i][selected_actions[i]]
advantage_learning_update = v_st_target[i] - q_st_target[i, actions[i]]
next_advantage_learning_update = v_st_plus_1_target[i] - q_st_plus_1_target[i, selected_actions[i]]
# Persistent Advantage Learning or Regular Advantage Learning
if self.persistent:
TD_targets[i, actions[i]] -= self.alpha * min(advantage_learning_update, next_advantage_learning_update)
else:
TD_targets[i, actions[i]] -= self.alpha * advantage_learning_update
# mixing monte carlo updates
monte_carlo_target = total_return[i]
TD_targets[i, actions[i]] = (1 - self.monte_carlo_mixing_rate) * TD_targets[i, actions[i]] \
+ self.monte_carlo_mixing_rate * monte_carlo_target
result = self.main_network.train_and_sync_networks(current_states, TD_targets)
total_loss = result[0]
return total_loss

87
agents/policy_gradients_agent.py Normal file
View File

@@ -0,0 +1,87 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.policy_optimization_agent import *
import numpy as np
from logger import *
import tensorflow as tf
import matplotlib.pyplot as plt
from utils import *
class PolicyGradientsAgent(PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.last_gradient_update_step_idx = 0
def learn_from_batch(self, batch):
# batch contains a list of transitions to learn from
current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch)
for i in reversed(range(len(total_returns))):
if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
total_returns[i] = total_returns[0]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
# just take the total return as it is
pass
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
if self.std_discounted_return != 0:
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
else:
total_returns[i] = 0
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
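# the baseline subtracted above is the running mean of the returns observed at this
# timestep index across past episodes (maintained in update_episode_statistics)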
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
targets = total_returns
if not self.env.discrete_controls and len(actions.shape) < 2:
actions = np.expand_dims(actions, -1)
logger.create_signal_value('Returns Variance', np.std(total_returns), self.task_id)
logger.create_signal_value('Returns Mean', np.mean(total_returns), self.task_id)
result = self.main_network.online_network.accumulate_gradients([current_states, actions], targets)
total_loss = result[0]
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
# convert to batch so we can run it through the network
observation = np.expand_dims(np.array(curr_state['observation']), 0)
if self.env.discrete_controls:
# DISCRETE
action_values = self.main_network.online_network.predict(observation).squeeze()
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
action_value = {"action_probability": action_values[action]}
self.entropy.add_sample(-np.sum(action_values * np.log(action_values)))
else:
# CONTINUOUS
result = self.main_network.online_network.predict(observation)
action_values = result[0].squeeze()
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values
action_value = {}
return action, action_value

121
agents/policy_optimization_agent.py Normal file
View File

@@ -0,0 +1,121 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.agent import *
from memories.memory import Episode
class PolicyGradientRescaler(Enum):
TOTAL_RETURN = 0
FUTURE_RETURN = 1
FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
FUTURE_RETURN_NORMALIZED_BY_TIMESTEP = 3 # baselined
Q_VALUE = 4
A_VALUE = 5
TD_RESIDUAL = 6
DISCOUNTED_TD_RESIDUAL = 7
GAE = 8
class PolicyOptimizationAgent(Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
self.policy_gradient_rescaler = PolicyGradientRescaler().get(self.tp.agent.policy_gradient_rescaler)
# statistics for variance reduction
self.last_gradient_update_step_idx = 0
self.max_episode_length = 100000
self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
self.entropy = Signal('Entropy')
self.signals.append(self.entropy)
def log_to_screen(self, phase):
# log to screen
if self.current_episode > 0:
screen.log_dict(
OrderedDict([
("Worker", self.task_id),
("Episode", self.current_episode),
("total reward", self.total_reward_in_current_episode),
("steps", self.total_steps_counter),
("training iteration", self.training_iteration)
]),
prefix="Heatup" if self.in_heatup else "Training" if phase == RunPhase.TRAIN else "Testing"
)
def update_episode_statistics(self, episode):
episode_discounted_returns = []
for i in range(episode.length()):
transition = episode.get_transition(i)
episode_discounted_returns.append(transition.total_return)
self.num_episodes_where_step_has_been_seen[i] += 1
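# incremental running-mean update: mean <- mean + (return - mean) / n, written as two in-place steps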
self.mean_return_over_multiple_episodes[i] -= self.mean_return_over_multiple_episodes[i] / \
self.num_episodes_where_step_has_been_seen[i]
self.mean_return_over_multiple_episodes[i] += transition.total_return / \
self.num_episodes_where_step_has_been_seen[i]
self.mean_discounted_return = np.mean(episode_discounted_returns)
self.std_discounted_return = np.std(episode_discounted_returns)
def train(self):
if self.memory.length() == 0:
return 0
episode = self.memory.get_episode(0)
# check if we should calculate gradients or skip
episode_ended = self.memory.num_complete_episodes() >= 1
num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
is_t_max_steps_passed = num_steps_passed_since_last_update >= self.tp.agent.num_steps_between_gradient_updates
if not (is_t_max_steps_passed or episode_ended):
return 0
total_loss = 0
if num_steps_passed_since_last_update > 0:
# we need to update the returns of the episode until now
episode.update_returns(self.tp.agent.discount)
# get t_max transitions, or fewer if we reached a terminal state.
# this is used for both actor-critic and vanilla PG.
# In order to get full episodes, vanilla PG will set the end_idx to a very big value.
transitions = []
start_idx = self.last_gradient_update_step_idx
end_idx = episode.length()
for idx in range(start_idx, end_idx):
transitions.append(episode.get_transition(idx))
self.last_gradient_update_step_idx = end_idx
# update the statistics for the variance reduction techniques
if self.tp.agent.type == 'PolicyGradientsAgent':
self.update_episode_statistics(episode)
# accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
total_loss = self.learn_from_batch(transitions)
if self.current_episode % self.tp.agent.apply_gradients_every_x_episodes == 0:
self.main_network.apply_gradients_and_sync_networks()
# move the pointer to the next episode start and discard the episode. we use it only once
if episode_ended:
self.memory.remove_episode(0)
self.last_gradient_update_step_idx = 0
return total_loss

274
agents/ppo_agent.py Normal file
View File

@@ -0,0 +1,274 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.actor_critic_agent import *
from random import shuffle
import tensorflow as tf
# Proximal Policy Optimization - https://arxiv.org/pdf/1707.02286.pdf
class PPOAgent(ActorCriticAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
self.critic_network = self.main_network
# define the policy network
tuning_parameters.agent.input_types = [InputTypes.Observation]
tuning_parameters.agent.output_types = [OutputTypes.PPO]
tuning_parameters.agent.optimizer_type = 'Adam'
tuning_parameters.agent.l2_regularization = 0
self.policy_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'policy',
self.replicated_device, self.worker_device)
self.networks.append(self.policy_network)
# operations for changing the kl coefficient
self.kl_coefficient = tf.placeholder('float', name='kl_coefficient')
self.increase_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
self.kl_coefficient * 1.5)
self.decrease_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
self.kl_coefficient / 1.5)
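# the KL penalty coefficient is adapted multiplicatively (x1.5 or /1.5) according to how the
# measured KL divergence compares to the target range - see update_kl_coefficient below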
# signals definition
self.value_loss = Signal('Value Loss')
self.signals.append(self.value_loss)
self.policy_loss = Signal('Policy Loss')
self.signals.append(self.policy_loss)
self.kl_divergence = Signal('KL Divergence')
self.signals.append(self.kl_divergence)
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = Signal('Grads (unclipped)')
self.signals.append(self.unclipped_grads)
def fill_advantages(self, batch):
current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)
# * Found not to have any impact *
# current_states_with_timestep = self.concat_state_and_timestep(batch)
current_state_values = self.critic_network.online_network.predict([current_states]).squeeze()
# calculate advantages
advantages = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
advantages = total_return - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
# current_state_values[game_overs] = 0
for idx, game_over in enumerate(game_overs):
if game_over:
# get advantages for the rollout
value_bootstrapping = np.zeros((1,))
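# rollouts processed here end at a terminal state, so the value appended after the
# last state for bootstrapping is zero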
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
rollout_advantages, _ = \
self.get_general_advantage_estimation_values(rewards[episode_start_idx:idx+1],
rollout_state_values)
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
for transition, advantage in zip(self.memory.transitions, advantages):
transition.info['advantage'] = advantage
self.action_advantages.add_sample(advantages)
def train_value_network(self, dataset, epochs):
loss = []
current_states, _, _, _, _, total_return = self.extract_batch(dataset)
# * Found not to have any impact *
# add a timestep to the observation
# current_states_with_timestep = self.concat_state_and_timestep(dataset)
total_return = np.expand_dims(total_return, -1)
mix_fraction = self.tp.agent.value_targets_mix_fraction
for j in range(epochs):
batch_size = len(dataset)
if self.critic_network.online_network.optimizer_type != 'LBFGS':
batch_size = self.tp.batch_size
for i in range(len(dataset) // batch_size):
# split to batches for first order optimization techniques
current_states_batch = current_states[i * batch_size:(i + 1) * batch_size]
total_return_batch = total_return[i * batch_size:(i + 1) * batch_size]
old_policy_values = force_list(self.critic_network.target_network.predict(
[current_states_batch]).squeeze())
if self.critic_network.online_network.optimizer_type != 'LBFGS':
targets = total_return_batch
else:
current_values = self.critic_network.online_network.predict([current_states_batch])
targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction
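# for LBFGS, the targets mix the current value predictions with the returns using
# value_targets_mix_fraction, presumably to limit how far the value function moves
# in a single full-batch update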
value_loss = self.critic_network.online_network.\
accumulate_gradients([current_states_batch] + old_policy_values, targets)
self.critic_network.apply_gradients_to_online_network()
if self.tp.distributed:
self.critic_network.apply_gradients_to_global_network()
self.critic_network.online_network.reset_accumulated_gradients()
loss.append([value_loss[0]])
loss = np.mean(loss, 0)
return loss
def concat_state_and_timestep(self, dataset):
current_states_with_timestep = [np.append(transition.state['observation'], transition.info['timestep'])
for transition in dataset]
current_states_with_timestep = np.expand_dims(current_states_with_timestep, -1)
return current_states_with_timestep
def train_policy_network(self, dataset, epochs):
loss = []
for j in range(epochs):
loss = {
'total_loss': [],
'policy_losses': [],
'unclipped_grads': [],
'fetch_result': []
}
#shuffle(dataset)
for i in range(len(dataset) // self.tp.batch_size):
batch = dataset[i * self.tp.batch_size:(i + 1) * self.tp.batch_size]
current_states, _, actions, _, _, total_return = self.extract_batch(batch)
advantages = np.array([t.info['advantage'] for t in batch])
if not self.tp.env_instance.discrete_controls and len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
old_policy = force_list(self.policy_network.target_network.predict([current_states]))
# calculate gradients and apply on both the local policy network and on the global policy network
fetches = [self.policy_network.online_network.output_heads[0].kl_divergence,
self.policy_network.online_network.output_heads[0].entropy]
total_loss, policy_losses, unclipped_grads, fetch_result =\
self.policy_network.online_network.accumulate_gradients(
[current_states, actions] + old_policy, [advantages], additional_fetches=fetches)
self.policy_network.apply_gradients_to_online_network()
if self.tp.distributed:
self.policy_network.apply_gradients_to_global_network()
self.policy_network.online_network.reset_accumulated_gradients()
loss['total_loss'].append(total_loss)
loss['policy_losses'].append(policy_losses)
loss['unclipped_grads'].append(unclipped_grads)
loss['fetch_result'].append(fetch_result)
self.unclipped_grads.add_sample(unclipped_grads)
for key in loss.keys():
loss[key] = np.mean(loss[key], 0)
if self.tp.learning_rate_decay_rate != 0:
curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
self.curr_learning_rate.add_sample(curr_learning_rate)
else:
curr_learning_rate = self.tp.learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
("training epoch", j),
("learning_rate", curr_learning_rate)
]),
prefix="Policy training"
)
self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
self.entropy.add_sample(loss['fetch_result'][1])
self.kl_divergence.add_sample(loss['fetch_result'][0])
return loss['total_loss']
def update_kl_coefficient(self):
# John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow
# his implementation for now because we know it works well
screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
# update kl coefficient
kl_target = self.tp.agent.target_kl_divergence
kl_coefficient = self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)
if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
# kl too high => increase regularization
self.tp.sess.run(self.increase_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
# kl too low => decrease regularization
self.tp.sess.run(self.decrease_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
screen.log_title("KL penalty coefficient change = {} -> {}".format(
kl_coefficient, self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)))
def post_training_commands(self):
if self.tp.agent.use_kl_regularization:
self.update_kl_coefficient()
# clean memory
self.memory.clean()
def train(self):
self.policy_network.sync()
self.critic_network.sync()
dataset = self.memory.transitions
self.fill_advantages(dataset)
# take only the requested number of steps
dataset = dataset[:self.tp.agent.num_consecutive_playing_steps]
value_loss = self.train_value_network(dataset, 1)
policy_loss = self.train_policy_network(dataset, 10)
self.value_loss.add_sample(value_loss)
self.policy_loss.add_sample(policy_loss)
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(value_loss, policy_loss)
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
# convert to batch so we can run it through the network
observation = curr_state['observation']
observation = np.expand_dims(np.array(observation), 0)
if self.env.discrete_controls:
# DISCRETE
action_values = self.policy_network.online_network.predict(observation).squeeze()
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
action_info = {"action_probability": action_values[action]}
# self.entropy.add_sample(-np.sum(action_values * np.log(action_values)))
else:
# CONTINUOUS
action_values_mean, action_values_std = self.policy_network.online_network.predict(observation)
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
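# during training the action is sampled from the Gaussian policy N(mu, sigma) predicted
# by the policy head; during evaluation the mean action is used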
if phase == RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
else:
action = action_values_mean
action_info = {"action_probability": action_values_mean}
return action, action_info

64
agents/value_optimization_agent.py Normal file
View File

@@ -0,0 +1,64 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.agent import *
class ValueOptimizationAgent(Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=True):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
self.q_values = Signal("Q")
self.signals.append(self.q_values)
# Algorithms for which q_values are calculated from predictions will override this function
def get_q_values(self, prediction):
return prediction
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
# convert to batch so we can run it through the network
observation = np.expand_dims(np.array(curr_state['observation']), 0)
if self.tp.agent.use_measurements:
measurements = np.expand_dims(np.array(curr_state['measurements']), 0)
prediction = self.main_network.online_network.predict([observation, measurements])
else:
prediction = self.main_network.online_network.predict(observation)
actions_q_values = self.get_q_values(prediction)
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
if phase == RunPhase.TRAIN:
action = self.exploration_policy.get_action(actions_q_values)
else:
action = self.evaluation_exploration_policy.get_action(actions_q_values)
# this is for bootstrapped dqn
if type(actions_q_values) == list and len(actions_q_values) > 0:
actions_q_values = actions_q_values[self.exploration_policy.selected_head]
actions_q_values = actions_q_values.squeeze()
# store the q values statistics for logging
self.q_values.add_sample(actions_q_values)
# store information for plotting interactively (actual plotting is done in agent)
if self.tp.visualization.plot_action_values_online:
for idx, action_name in enumerate(self.env.actions_description):
self.episode_running_info[action_name].append(actions_q_values[idx])
action_value = {"action_value": actions_q_values[action]}
return action, action_value