pre-release 0.10.0

rl_coach/architectures/tensorflow_components/architecture.py (new file, 664 lines)
@@ -0,0 +1,664 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import force_list, squeeze_list
|
||||
|
||||
from rl_coach.architectures.architecture import Architecture
|
||||
from rl_coach.core_types import GradientClippingMethod
|
||||
|
||||
|
||||
def batchnorm_activation_dropout(input_layer, batchnorm, activation_function, dropout, dropout_rate, layer_idx):
|
||||
layers = [input_layer]
|
||||
|
||||
# batchnorm
|
||||
if batchnorm:
|
||||
layers.append(
|
||||
tf.layers.batch_normalization(layers[-1], name="batchnorm{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# activation
|
||||
if activation_function:
|
||||
layers.append(
|
||||
activation_function(layers[-1], name="activation{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# dropout
|
||||
if dropout:
|
||||
layers.append(
|
||||
tf.layers.dropout(layers[-1], dropout_rate, name="dropout{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# remove the input layer from the layers list
|
||||
del layers[0]
|
||||
|
||||
return layers
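
# Illustrative usage sketch (added for clarity, not part of the original module). It assumes a
# TF1-style graph; the placeholder shape and hyperparameters are arbitrary example values.
def _example_batchnorm_activation_dropout():
    example_input = tf.placeholder(tf.float32, shape=[None, 128], name="example_input")
    hidden = tf.layers.dense(example_input, 256, name="example_fc")
    # chain batchnorm -> activation -> dropout after the dense layer, in that order
    post_layers = batchnorm_activation_dropout(hidden, batchnorm=True,
                                               activation_function=tf.nn.relu,
                                               dropout=True, dropout_rate=0.3, layer_idx=0)
    return post_layers[-1]  # the tensor produced by the last applied sub-layer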
|
||||
|
||||
|
||||
class Conv2d(object):
|
||||
def __init__(self, params: List):
|
||||
"""
|
||||
:param params: list of [num_filters, kernel_size, strides]
|
||||
"""
|
||||
self.params = params
|
||||
|
||||
def __call__(self, input_layer, name: str):
|
||||
"""
|
||||
returns a tensorflow conv2d layer
|
||||
:param input_layer: previous layer
|
||||
:param name: layer name
|
||||
:return: conv2d layer
|
||||
"""
|
||||
return tf.layers.conv2d(input_layer, filters=self.params[0], kernel_size=self.params[1], strides=self.params[2],
|
||||
data_format='channels_last', name=name)
|
||||
|
||||
|
||||
class Dense(object):
|
||||
def __init__(self, params: List):
|
||||
"""
|
||||
:param params: list of [num_output_neurons]
|
||||
"""
|
||||
self.params = params
|
||||
|
||||
def __call__(self, input_layer, name: str):
|
||||
"""
|
||||
returns a tensorflow dense layer
|
||||
:param input_layer: previous layer
|
||||
:param name: layer name
|
||||
:return: dense layer
|
||||
"""
|
||||
return tf.layers.dense(input_layer, self.params[0], name=name)
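
# Illustrative sketch (not part of the original module): these thin wrappers take their
# hyperparameters as plain lists, which is how the embedder schemes elsewhere in this change
# declare layers. For example, Conv2d([32, 8, 4]) means 32 filters, an 8x8 kernel and stride 4,
# and Dense([256]) means a fully connected layer with 256 output neurons. The shapes below are
# arbitrary example values.
def _example_layer_wrappers():
    frames = tf.placeholder(tf.float32, shape=[None, 84, 84, 4], name="example_frames")
    conv = Conv2d([32, 8, 4])(frames, name="example_conv0")
    flat = tf.contrib.layers.flatten(conv)
    return Dense([256])(flat, name="example_fc0")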
|
||||
|
||||
|
||||
def variable_summaries(var):
|
||||
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
|
||||
with tf.name_scope('summaries'):
|
||||
layer_weight_name = '_'.join(var.name.split('/')[-3:])[:-2]
|
||||
|
||||
with tf.name_scope(layer_weight_name):
|
||||
mean = tf.reduce_mean(var)
|
||||
tf.summary.scalar('mean', mean)
|
||||
with tf.name_scope('stddev'):
|
||||
stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
|
||||
tf.summary.scalar('stddev', stddev)
|
||||
tf.summary.scalar('max', tf.reduce_max(var))
|
||||
tf.summary.scalar('min', tf.reduce_min(var))
|
||||
tf.summary.histogram('histogram', var)
|
||||
|
||||
|
||||
def local_getter(getter, name, *args, **kwargs):
|
||||
"""
|
||||
This is a wrapper around the tf.get_variable function which puts the variables in the local variables collection
|
||||
instead of the global variables collection. The local variables collection will hold variables which are not shared
|
||||
between workers. these variables are also assumed to be non-trainable (the optimizer does not apply gradients to
|
||||
these variables), but we can calculate the gradients wrt these variables, and we can update their content.
|
||||
"""
|
||||
kwargs['collections'] = [tf.GraphKeys.LOCAL_VARIABLES]
|
||||
return getter(name, *args, **kwargs)
|
||||
|
||||
|
||||
class TensorFlowArchitecture(Architecture):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= "",
|
||||
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
|
||||
"""
|
||||
:param agent_parameters: the agent parameters
|
||||
:param spaces: the spaces definition of the agent
|
||||
:param name: the name of the network
|
||||
:param global_network: the global network replica that is shared between all the workers
|
||||
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
|
||||
:param network_is_trainable: is the network trainable (we can apply gradients on it)
|
||||
"""
|
||||
super().__init__(agent_parameters, spaces, name)
|
||||
self.middleware = None
|
||||
self.network_is_local = network_is_local
|
||||
self.global_network = global_network
|
||||
if not self.network_parameters.tensorflow_support:
|
||||
raise ValueError('TensorFlow is not supported for this agent')
|
||||
self.sess = None
|
||||
self.inputs = {}
|
||||
self.outputs = []
|
||||
self.targets = []
|
||||
self.importance_weights = []
|
||||
self.losses = []
|
||||
self.total_loss = None
|
||||
self.trainable_weights = []
|
||||
self.weights_placeholders = []
|
||||
self.shared_accumulated_gradients = []
|
||||
self.curr_rnn_c_in = None
|
||||
self.curr_rnn_h_in = None
|
||||
self.gradients_wrt_inputs = []
|
||||
self.train_writer = None
|
||||
self.accumulated_gradients = None
|
||||
self.network_is_trainable = network_is_trainable
|
||||
|
||||
self.is_chief = self.ap.task_parameters.task_index == 0
|
||||
self.network_is_global = not self.network_is_local and global_network is None
|
||||
self.distributed_training = self.network_is_global or self.network_is_local and global_network is not None
|
||||
|
||||
self.optimizer_type = self.network_parameters.optimizer_type
|
||||
if self.ap.task_parameters.seed is not None:
|
||||
tf.set_random_seed(self.ap.task_parameters.seed)
|
||||
with tf.variable_scope("/".join(self.name.split("/")[1:]), initializer=tf.contrib.layers.xavier_initializer(),
|
||||
custom_getter=local_getter if network_is_local and global_network else None):
|
||||
self.global_step = tf.train.get_or_create_global_step()
|
||||
|
||||
# build the network
|
||||
self.get_model()
|
||||
|
||||
# model weights
|
||||
self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.full_name)
|
||||
|
||||
# create the placeholders for assigning gradients, and some tensorboard summaries for the weights
|
||||
for idx, var in enumerate(self.weights):
|
||||
placeholder = tf.placeholder(tf.float32, shape=var.get_shape(), name=str(idx) + '_holder')
|
||||
self.weights_placeholders.append(placeholder)
|
||||
if self.ap.visualization.tensorboard:
|
||||
variable_summaries(var)
|
||||
|
||||
# create op for assigning a list of weights to the network weights
|
||||
self.update_weights_from_list = [weights.assign(holder) for holder, weights in
|
||||
zip(self.weights_placeholders, self.weights)]
|
||||
|
||||
# locks for synchronous training
|
||||
if self.network_is_global:
|
||||
self._create_locks_for_synchronous_training()
|
||||
|
||||
# gradients ops
|
||||
self._create_gradient_ops()
|
||||
|
||||
# L2 regularization
|
||||
if self.network_parameters.l2_regularization != 0:
|
||||
self.l2_regularization = [tf.add_n([tf.nn.l2_loss(v) for v in self.weights])
|
||||
* self.network_parameters.l2_regularization]
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.l2_regularization)
|
||||
|
||||
self.inc_step = self.global_step.assign_add(1)
|
||||
|
||||
# reset LSTM hidden cells
|
||||
self.reset_internal_memory()
|
||||
|
||||
if self.ap.visualization.tensorboard:
|
||||
current_scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
|
||||
scope=tf.contrib.framework.get_name_scope())
|
||||
self.merged = tf.summary.merge(current_scope_summaries)
|
||||
|
||||
# initialize or restore model
|
||||
self.init_op = tf.group(
|
||||
tf.global_variables_initializer(),
|
||||
tf.local_variables_initializer()
|
||||
)
|
||||
|
||||
# set the fetches for training
|
||||
self._set_initial_fetch_list()
|
||||
|
||||
def _set_initial_fetch_list(self):
|
||||
"""
|
||||
Create an initial list of tensors to fetch in each training iteration
|
||||
:return: None
|
||||
"""
|
||||
self.train_fetches = [self.gradients_norm]
|
||||
if self.network_parameters.clip_gradients:
|
||||
self.train_fetches.append(self.clipped_grads)
|
||||
else:
|
||||
self.train_fetches.append(self.tensor_gradients)
|
||||
self.train_fetches += [self.total_loss, self.losses]
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
self.train_fetches.append(self.middleware.state_out)
|
||||
self.additional_fetches_start_idx = len(self.train_fetches)
|
||||
|
||||
def _create_locks_for_synchronous_training(self):
|
||||
"""
|
||||
Create locks for synchronizing the different workers during training
|
||||
:return: None
|
||||
"""
|
||||
self.lock_counter = tf.get_variable("lock_counter", [], tf.int32,
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
self.lock = self.lock_counter.assign_add(1, use_locking=True)
|
||||
self.lock_init = self.lock_counter.assign(0)
|
||||
|
||||
self.release_counter = tf.get_variable("release_counter", [], tf.int32,
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
self.release = self.release_counter.assign_add(1, use_locking=True)
|
||||
self.release_decrement = self.release_counter.assign_add(-1, use_locking=True)
|
||||
self.release_init = self.release_counter.assign(0)
|
||||
|
||||
def _create_gradient_ops(self):
|
||||
"""
|
||||
Create all the tensorflow operations for calculating gradients, processing the gradients and applying them
|
||||
:return: None
|
||||
"""
|
||||
|
||||
self.tensor_gradients = tf.gradients(self.total_loss, self.weights)
|
||||
self.gradients_norm = tf.global_norm(self.tensor_gradients)
|
||||
|
||||
# gradient clipping
|
||||
if self.network_parameters.clip_gradients is not None and self.network_parameters.clip_gradients != 0:
|
||||
self._create_gradient_clipping_ops()
|
||||
|
||||
# when using a shared optimizer, we create accumulators to store gradients from all the workers before
|
||||
# applying them
|
||||
if self.distributed_training:
|
||||
self._create_gradient_accumulators()
|
||||
|
||||
# gradients of the outputs w.r.t. the inputs
|
||||
# at the moment, this is only used by ddpg
|
||||
self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in
|
||||
self.inputs.items()} for output in self.outputs]
|
||||
self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights')
|
||||
for i in range(len(self.outputs))]
|
||||
self.weighted_gradients = []
|
||||
for i in range(len(self.outputs)):
|
||||
unnormalized_gradients = tf.gradients(self.outputs[i], self.weights, self.gradients_weights_ph[i])
|
||||
# unnormalized gradients seem to work better at the moment. TODO: validate this across more environments
|
||||
# self.weighted_gradients.append(list(map(lambda x: tf.div(x, self.network_parameters.batch_size),
|
||||
# unnormalized_gradients)))
|
||||
self.weighted_gradients.append(unnormalized_gradients)
|
||||
|
||||
# defining the optimization process (for LBFGS we have less control over the optimizer)
|
||||
if self.optimizer_type != 'LBFGS' and self.network_is_trainable:
|
||||
self._create_gradient_applying_ops()
|
||||
|
||||
def _create_gradient_accumulators(self):
|
||||
if self.network_is_global:
|
||||
self.shared_accumulated_gradients = [tf.Variable(initial_value=tf.zeros_like(var)) for var in self.weights]
|
||||
self.accumulate_shared_gradients = [var.assign_add(holder, use_locking=True) for holder, var in
|
||||
zip(self.weights_placeholders, self.shared_accumulated_gradients)]
|
||||
self.init_shared_accumulated_gradients = [var.assign(tf.zeros_like(var)) for var in
|
||||
self.shared_accumulated_gradients]
|
||||
elif self.network_is_local:
|
||||
self.accumulate_shared_gradients = self.global_network.accumulate_shared_gradients
|
||||
self.init_shared_accumulated_gradients = self.global_network.init_shared_accumulated_gradients
|
||||
|
||||
def _create_gradient_clipping_ops(self):
|
||||
"""
|
||||
Create tensorflow ops for clipping the gradients according to the given GradientClippingMethod
|
||||
:return: None
|
||||
"""
|
||||
if self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByGlobalNorm:
|
||||
self.clipped_grads, self.grad_norms = tf.clip_by_global_norm(self.tensor_gradients,
|
||||
self.network_parameters.clip_gradients)
|
||||
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByValue:
|
||||
self.clipped_grads = [tf.clip_by_value(grad,
|
||||
-self.network_parameters.clip_gradients,
|
||||
self.network_parameters.clip_gradients)
|
||||
for grad in self.tensor_gradients]
|
||||
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByNorm:
|
||||
self.clipped_grads = [tf.clip_by_norm(grad, self.network_parameters.clip_gradients)
|
||||
for grad in self.tensor_gradients]
|
||||
|
||||
def _create_gradient_applying_ops(self):
|
||||
"""
|
||||
Create tensorflow ops for applying the gradients to the network weights according to the training scheme
|
||||
(distributed training - local or global network, shared optimizer, etc.)
|
||||
:return: None
|
||||
"""
|
||||
if self.network_is_global and self.network_parameters.shared_optimizer and \
|
||||
not self.network_parameters.async_training:
|
||||
# synchronous training with shared optimizer? -> create an operation for applying the gradients
|
||||
# accumulated in the shared gradients accumulator
|
||||
self.update_weights_from_shared_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.shared_accumulated_gradients, self.weights),
|
||||
global_step=self.global_step)
|
||||
|
||||
elif self.distributed_training and self.network_is_local:
|
||||
# distributed training but independent optimizer? -> create an operation for applying the gradients
|
||||
# to the global weights
|
||||
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.weights_placeholders, self.global_network.weights), global_step=self.global_step)
|
||||
|
||||
elif self.network_is_trainable:
|
||||
# not any of the above but is trainable? -> create an operation for applying the gradients to
|
||||
# this network weights
|
||||
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.weights_placeholders, self.weights), global_step=self.global_step)
|
||||
|
||||
def set_session(self, sess):
|
||||
self.sess = sess
|
||||
|
||||
task_is_distributed = isinstance(self.ap.task_parameters, DistributedTaskParameters)
|
||||
# initialize the session parameters in single threaded runs. Otherwise, this is done through the
|
||||
# MonitoredSession object in the graph manager
|
||||
if not task_is_distributed:
|
||||
self.sess.run(self.init_op)
|
||||
|
||||
if self.ap.visualization.tensorboard:
|
||||
# Write the merged summaries to the current experiment directory
|
||||
if not task_is_distributed:
|
||||
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path + '/tensorboard')
|
||||
self.train_writer.add_graph(self.sess.graph)
|
||||
elif self.network_is_local:
|
||||
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path +
|
||||
'/tensorboard/worker{}'.format(self.ap.task_parameters.task_index))
|
||||
self.train_writer.add_graph(self.sess.graph)
|
||||
|
||||
# wait for all the workers to set their session
|
||||
if not self.network_is_local:
|
||||
self.wait_for_all_workers_barrier()
|
||||
|
||||
def reset_accumulated_gradients(self):
|
||||
"""
|
||||
Reset the gradients accumulation placeholder
|
||||
"""
|
||||
if self.accumulated_gradients is None:
|
||||
self.accumulated_gradients = self.sess.run(self.weights)
|
||||
|
||||
for ix, grad in enumerate(self.accumulated_gradients):
|
||||
self.accumulated_gradients[ix] = grad * 0
|
||||
|
||||
def accumulate_gradients(self, inputs, targets, additional_fetches=None, importance_weights=None,
|
||||
no_accumulation=False):
|
||||
"""
|
||||
Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation
|
||||
placeholders
|
||||
:param additional_fetches: Optional tensors to fetch during gradients calculation
|
||||
:param inputs: The input batch for the network
|
||||
:param targets: The targets corresponding to the input batch
|
||||
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the samples' losses won't be scaled
|
||||
:param no_accumulation: If set to True, the gradients in the accumulated gradients placeholder will be
replaced by the newly calculated gradients instead of being accumulated.
This can speed up the function runtime by around 10%.
|
||||
:return: A list containing the total loss and the individual network heads losses
|
||||
"""
|
||||
|
||||
if self.accumulated_gradients is None:
|
||||
self.reset_accumulated_gradients()
|
||||
|
||||
# feed inputs
|
||||
if additional_fetches is None:
|
||||
additional_fetches = []
|
||||
feed_dict = self.create_feed_dict(inputs)
|
||||
|
||||
# feed targets
|
||||
targets = force_list(targets)
|
||||
for placeholder_idx, target in enumerate(targets):
|
||||
feed_dict[self.targets[placeholder_idx]] = target
|
||||
|
||||
# feed importance weights
|
||||
importance_weights = force_list(importance_weights)
|
||||
for placeholder_idx, target_ph in enumerate(targets):
|
||||
if len(importance_weights) <= placeholder_idx or importance_weights[placeholder_idx] is None:
|
||||
importance_weight = np.ones(target_ph.shape[0])
|
||||
else:
|
||||
importance_weight = importance_weights[placeholder_idx]
|
||||
importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
|
||||
|
||||
feed_dict[self.importance_weights[placeholder_idx]] = importance_weight
|
||||
|
||||
if self.optimizer_type != 'LBFGS':
|
||||
|
||||
# feed the lstm state if necessary
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
# NOTE: this feeds the initial LSTM state, i.e. it assumes the batch starts a new sequence, which may not always hold
|
||||
feed_dict[self.middleware.c_in] = self.middleware.c_init
|
||||
feed_dict[self.middleware.h_in] = self.middleware.h_init
|
||||
|
||||
fetches = self.train_fetches + additional_fetches
|
||||
if self.ap.visualization.tensorboard:
|
||||
fetches += [self.merged]
|
||||
|
||||
# get grads
|
||||
result = self.sess.run(fetches, feed_dict=feed_dict)
|
||||
if hasattr(self, 'train_writer') and self.train_writer is not None:
|
||||
self.train_writer.add_summary(result[-1], self.sess.run(self.global_step))
|
||||
|
||||
# extract the fetches
|
||||
norm_unclipped_grads, grads, total_loss, losses = result[:4]
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
(self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
|
||||
fetched_tensors = []
|
||||
if len(additional_fetches) > 0:
|
||||
fetched_tensors = result[self.additional_fetches_start_idx:self.additional_fetches_start_idx +
|
||||
len(additional_fetches)]
|
||||
|
||||
# accumulate the gradients
|
||||
for idx, grad in enumerate(grads):
|
||||
if no_accumulation:
|
||||
self.accumulated_gradients[idx] = grad
|
||||
else:
|
||||
self.accumulated_gradients[idx] += grad
|
||||
|
||||
return total_loss, losses, norm_unclipped_grads, fetched_tensors
|
||||
|
||||
else:
|
||||
self.optimizer.minimize(session=self.sess, feed_dict=feed_dict)
|
||||
|
||||
return [0]
|
||||
|
||||
def create_feed_dict(self, inputs):
|
||||
feed_dict = {}
|
||||
for input_name, input_value in inputs.items():
|
||||
if isinstance(input_name, str):
|
||||
if input_name not in self.inputs:
|
||||
raise ValueError((
|
||||
'input name {input_name} was provided to create a feed '
|
||||
'dictionary, but there is no placeholder with that name. '
|
||||
'placeholder names available include: {placeholder_names}'
|
||||
).format(
|
||||
input_name=input_name,
|
||||
placeholder_names=', '.join(self.inputs.keys())
|
||||
))
|
||||
|
||||
feed_dict[self.inputs[input_name]] = input_value
|
||||
elif isinstance(input_name, tf.Tensor) and input_name.op.type == 'Placeholder':
|
||||
feed_dict[input_name] = input_value
|
||||
else:
|
||||
raise ValueError((
|
||||
'input dictionary expects strings or placeholders as keys, '
|
||||
'but found key {key} of type {type}'
|
||||
).format(
|
||||
key=input_name,
|
||||
type=type(input_name),
|
||||
))
|
||||
|
||||
return feed_dict
|
||||
|
||||
def apply_and_reset_gradients(self, gradients, scaler=1.):
|
||||
"""
|
||||
Applies the given gradients to the network weights and resets the accumulation placeholder
|
||||
:param gradients: The gradients to use for the update
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them
|
||||
"""
|
||||
self.apply_gradients(gradients, scaler)
|
||||
self.reset_accumulated_gradients()
|
||||
|
||||
def wait_for_all_workers_to_lock(self, lock: str, include_only_training_workers: bool=False):
|
||||
"""
|
||||
Waits for all the workers to lock a certain lock and then continues
|
||||
:param lock: the name of the lock to use
|
||||
:param include_only_training_workers: wait only for training workers or for all the workers?
|
||||
:return: None
|
||||
"""
|
||||
if include_only_training_workers:
|
||||
num_workers_to_wait_for = self.ap.task_parameters.num_training_tasks
|
||||
else:
|
||||
num_workers_to_wait_for = self.ap.task_parameters.num_tasks
|
||||
|
||||
# lock
|
||||
if hasattr(self, '{}_counter'.format(lock)):
|
||||
self.sess.run(getattr(self, lock))
|
||||
while self.sess.run(getattr(self, '{}_counter'.format(lock))) % num_workers_to_wait_for != 0:
|
||||
time.sleep(0.00001)
|
||||
# self.sess.run(getattr(self, '{}_init'.format(lock)))
|
||||
else:
|
||||
raise ValueError("no counter was defined for the lock {}".format(lock))
|
||||
|
||||
def wait_for_all_workers_barrier(self, include_only_training_workers: bool=False):
|
||||
"""
|
||||
A barrier that allows waiting for all the workers to finish a certain block of commands
|
||||
:param include_only_training_workers: wait only for training workers or for all the workers?
|
||||
:return: None
|
||||
"""
|
||||
self.wait_for_all_workers_to_lock('lock', include_only_training_workers=include_only_training_workers)
|
||||
self.sess.run(self.lock_init)
|
||||
|
||||
# we need to lock again (on a different lock) in order to prevent a situation where one of the workers
# continues and manages to increase the first lock again by one, only to have a late worker reset it afterwards.
# so we want to make sure that all workers are done resetting the lock before continuing to reuse that lock.
|
||||
|
||||
self.wait_for_all_workers_to_lock('release', include_only_training_workers=include_only_training_workers)
|
||||
self.sess.run(self.release_init)
|
||||
|
||||
def apply_gradients(self, gradients, scaler=1.):
|
||||
"""
|
||||
Applies the given gradients to the network weights
|
||||
:param gradients: The gradients to use for the update
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them.
|
||||
The gradients will be MULTIPLIED by this factor
|
||||
"""
|
||||
if self.network_parameters.async_training or not isinstance(self.ap.task_parameters, DistributedTaskParameters):
|
||||
if hasattr(self, 'global_step') and not self.network_is_local:
|
||||
self.sess.run(self.inc_step)
|
||||
|
||||
if self.optimizer_type != 'LBFGS':
|
||||
|
||||
if self.distributed_training and not self.network_parameters.async_training:
|
||||
# rescale the gradients so that they average out with the gradients from the other workers
|
||||
if self.network_parameters.scale_down_gradients_by_number_of_workers_for_sync_training:
|
||||
scaler /= float(self.ap.task_parameters.num_training_tasks)
|
||||
|
||||
# rescale the gradients
|
||||
if scaler != 1.:
|
||||
for gradient in gradients:
|
||||
gradient *= scaler
|
||||
|
||||
# apply the gradients
|
||||
feed_dict = dict(zip(self.weights_placeholders, gradients))
|
||||
if self.distributed_training and self.network_parameters.shared_optimizer \
|
||||
and not self.network_parameters.async_training:
|
||||
# synchronous distributed training with shared optimizer:
|
||||
# - each worker adds its gradients to the shared gradients accumulators
|
||||
# - we wait for all the workers to add their gradients
|
||||
# - the chief worker (worker with task index = 0) applies the gradients once and resets the accumulators
|
||||
|
||||
self.sess.run(self.accumulate_shared_gradients, feed_dict=feed_dict)
|
||||
|
||||
self.wait_for_all_workers_barrier(include_only_training_workers=True)
|
||||
|
||||
if self.is_chief:
|
||||
self.sess.run(self.update_weights_from_shared_gradients)
|
||||
self.sess.run(self.init_shared_accumulated_gradients)
|
||||
else:
|
||||
# async distributed training / distributed training with independent optimizer
|
||||
# / non-distributed training - just apply the gradients
|
||||
feed_dict = dict(zip(self.weights_placeholders, gradients))
|
||||
self.sess.run(self.update_weights_from_batch_gradients, feed_dict=feed_dict)
|
||||
|
||||
# release barrier
|
||||
if self.distributed_training and not self.network_parameters.async_training:
|
||||
self.wait_for_all_workers_barrier(include_only_training_workers=True)
|
||||
|
||||
def predict(self, inputs, outputs=None, squeeze_output=True, initial_feed_dict=None):
|
||||
"""
|
||||
Run a forward pass of the network using the given input
|
||||
:param inputs: The input for the network
|
||||
:param outputs: The output for the network, defaults to self.outputs
|
||||
:param squeeze_output: call squeeze_list on output
|
||||
:param initial_feed_dict: a dictionary to use as the initial feed_dict. other inputs will be added to this dict
|
||||
:return: The network output
|
||||
|
||||
WARNING: must only be called once per state, since each call is assumed by the LSTM to be a new time step.
|
||||
"""
|
||||
feed_dict = self.create_feed_dict(inputs)
|
||||
if initial_feed_dict:
|
||||
feed_dict.update(initial_feed_dict)
|
||||
if outputs is None:
|
||||
outputs = self.outputs
|
||||
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
feed_dict[self.middleware.c_in] = self.curr_rnn_c_in
|
||||
feed_dict[self.middleware.h_in] = self.curr_rnn_h_in
|
||||
|
||||
output, (self.curr_rnn_c_in, self.curr_rnn_h_in) = self.sess.run([outputs, self.middleware.state_out],
|
||||
feed_dict=feed_dict)
|
||||
else:
|
||||
output = self.sess.run(outputs, feed_dict)
|
||||
|
||||
if squeeze_output:
|
||||
output = squeeze_list(output)
|
||||
return output
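
# Illustrative usage sketch (added comment, not part of the original module): the keys of the
# inputs dict must match the network's input embedder names; 'observation' below is only a
# hypothetical example of such a key, and 'network' a hypothetical instance of this class.
#   states = {'observation': np.expand_dims(current_observation, 0)}
#   action_values = network.predict(states)
# for an LSTM middleware, predict() should be called exactly once per time step (see the warning above).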
|
||||
|
||||
def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None, importance_weights=None):
|
||||
"""
|
||||
Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
|
||||
:param additional_fetches: Optional tensors to fetch during the training process
|
||||
:param inputs: The input for the network
|
||||
:param targets: The targets corresponding to the input batch
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them
|
||||
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the samples' losses won't be scaled
|
||||
:return: The loss of the network
|
||||
"""
|
||||
if additional_fetches is None:
|
||||
additional_fetches = []
|
||||
additional_fetches = force_list(additional_fetches)
|
||||
loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
|
||||
importance_weights=importance_weights)
|
||||
self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
|
||||
return loss
|
||||
|
||||
def get_weights(self):
|
||||
"""
|
||||
:return: a list of tensors containing the network weights for each layer
|
||||
"""
|
||||
return self.weights
|
||||
|
||||
def set_weights(self, weights, new_rate=1.0):
|
||||
"""
|
||||
Sets the network weights from the given list of weights tensors
|
||||
"""
|
||||
feed_dict = {}
|
||||
old_weights, new_weights = self.sess.run([self.get_weights(), weights])
|
||||
for placeholder_idx, new_weight in enumerate(new_weights):
|
||||
feed_dict[self.weights_placeholders[placeholder_idx]]\
|
||||
= new_rate * new_weight + (1 - new_rate) * old_weights[placeholder_idx]
|
||||
self.sess.run(self.update_weights_from_list, feed_dict)
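
# Illustrative usage sketch (added comment, not part of the original module): a common pattern is
# to have a target network slowly track an online network, with new_rate as the mixing coefficient.
# 'online_network' and 'target_network' are hypothetical instances of this class.
#   target_network.set_weights(online_network.get_weights(), new_rate=0.001)
# with the default new_rate=1.0, the target weights are simply overwritten by the online weights.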
|
||||
|
||||
def get_variable_value(self, variable):
|
||||
"""
|
||||
Get the value of a variable from the graph
|
||||
:param variable: the variable
|
||||
:return: the value of the variable
|
||||
"""
|
||||
return self.sess.run(variable)
|
||||
|
||||
def set_variable_value(self, assign_op, value, placeholder=None):
|
||||
"""
|
||||
Updates the value of a variable.
|
||||
This requires having an assign operation for the variable, and a placeholder which will provide the value
|
||||
:param assign_op: an assign operation for the variable
|
||||
:param value: a value to set the variable to
|
||||
:param placeholder: a placeholder to hold the given value for injecting it into the variable
|
||||
"""
|
||||
self.sess.run(assign_op, feed_dict={placeholder: value})
|
||||
|
||||
def reset_internal_memory(self):
|
||||
"""
|
||||
Reset any internal memory used by the network. For example, an LSTM internal state
|
||||
:return: None
|
||||
"""
|
||||
# initialize LSTM hidden states
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
self.curr_rnn_c_in = self.middleware.c_init
|
||||
self.curr_rnn_h_in = self.middleware.h_init
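
# Illustrative usage sketch (added comment, not part of the original module): when the middleware is
# an LSTM, the internal state should be cleared at episode boundaries so that predictions in a new
# episode do not reuse state from the previous one. 'network' is a hypothetical instance of this class.
#   network.reset_internal_memory()  # call at the start of every new episode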
|
||||
@@ -0,0 +1,102 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def create_cluster_spec(parameters_server: str, workers: str) -> tf.train.ClusterSpec:
|
||||
"""
|
||||
Creates a ClusterSpec object representing the cluster.
|
||||
:param parameters_server: comma-separated list of hostname:port pairs to which the parameter servers are assigned
|
||||
:param workers: comma-separated list of hostname:port pairs to which the workers are assigned
|
||||
:return: a ClusterSpec object representing the cluster
|
||||
"""
|
||||
# extract the parameter servers and workers from the given strings
|
||||
ps_hosts = parameters_server.split(",")
|
||||
worker_hosts = workers.split(",")
|
||||
|
||||
# Create a cluster spec from the parameter server and worker hosts
|
||||
cluster_spec = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
|
||||
|
||||
return cluster_spec
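
# Illustrative usage sketch (not part of the original module); the host:port addresses below are
# placeholder example values.
def _example_cluster_spec() -> tf.train.ClusterSpec:
    # one parameter server and two workers, all on the local host
    return create_cluster_spec(parameters_server="localhost:2222",
                               workers="localhost:2223,localhost:2224")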
|
||||
|
||||
|
||||
def create_and_start_parameters_server(cluster_spec: tf.train.ClusterSpec, config: tf.ConfigProto=None) -> None:
|
||||
"""
|
||||
Create and start a parameter server
|
||||
:param cluster_spec: the ClusterSpec object representing the cluster
|
||||
:param config: the tensorflow config to use
|
||||
:return: None
|
||||
"""
|
||||
# create a server object for the parameter server
|
||||
server = tf.train.Server(cluster_spec, job_name="ps", task_index=0, config=config)
|
||||
|
||||
# wait for the server to finish
|
||||
server.join()
|
||||
|
||||
|
||||
def create_worker_server_and_device(cluster_spec: tf.train.ClusterSpec, task_index: int,
|
||||
use_cpu: bool=True, config: tf.ConfigProto=None) -> Tuple[str, tf.device]:
|
||||
"""
|
||||
Creates a worker server and a device setter used to assign the workers operations to
|
||||
:param cluster_spec: a ClusterSpec object representing the cluster
|
||||
:param task_index: the index of the worker task
|
||||
:param use_cpu: if use_cpu=True, all the agent operations will be assigned to a CPU instead of a GPU
|
||||
:param config: the tensorflow config to use
|
||||
:return: the target string for the tf.Session and the worker device setter object
|
||||
"""
|
||||
# Create and start a worker
|
||||
server = tf.train.Server(cluster_spec, job_name="worker", task_index=task_index, config=config)
|
||||
|
||||
# Assign ops to the local worker
|
||||
worker_device = "/job:worker/task:{}".format(task_index)
|
||||
if use_cpu:
|
||||
worker_device += "/cpu:0"
|
||||
else:
|
||||
worker_device += "/device:GPU:0"
|
||||
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster_spec)
|
||||
|
||||
return server.target, device
|
||||
|
||||
|
||||
def create_monitored_session(target: tf.train.Server, task_index: int,
|
||||
checkpoint_dir: str, save_checkpoint_secs: int, config: tf.ConfigProto=None) -> tf.Session:
|
||||
"""
|
||||
Create a monitored session for the worker
|
||||
:param target: the target string for the tf.Session
|
||||
:param task_index: the task index of the worker
|
||||
:param checkpoint_dir: a directory path where the checkpoints will be stored
|
||||
:param save_checkpoint_secs: number of seconds between checkpoints storing
|
||||
:param config: the tensorflow configuration (optional)
|
||||
:return: the session to use for the run
|
||||
"""
|
||||
# we chose the first task to be the chief
|
||||
is_chief = task_index == 0
|
||||
|
||||
# Create the monitored session
|
||||
sess = tf.train.MonitoredTrainingSession(
|
||||
master=target,
|
||||
is_chief=is_chief,
|
||||
hooks=[],
|
||||
checkpoint_dir=checkpoint_dir,
|
||||
save_checkpoint_secs=save_checkpoint_secs,
|
||||
config=config
|
||||
)
|
||||
|
||||
return sess
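
# Illustrative sketch (not part of the original module) of how the helpers above fit together for
# a single worker; the addresses, checkpoint directory and checkpoint interval are example values.
def _example_worker_setup(task_index: int = 0) -> tf.Session:
    cluster_spec = create_cluster_spec(parameters_server="localhost:2222",
                                       workers="localhost:2223,localhost:2224")
    target, device = create_worker_server_and_device(cluster_spec, task_index, use_cpu=True)
    with tf.device(device):
        pass  # the worker's graph would be built here, pinned via the replica device setter
    return create_monitored_session(target, task_index,
                                    checkpoint_dir="/tmp/example_checkpoints",
                                    save_checkpoint_secs=600)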
|
||||
|
||||
@@ -0,0 +1,114 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.shared_variables import SharedRunningStats
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
|
||||
from rl_coach.core_types import InputEmbedding
|
||||
|
||||
|
||||
class InputEmbedder(object):
|
||||
"""
|
||||
An input embedder is the first part of the network, which takes the input from the state and produces a vector
|
||||
embedding by passing it through a neural network. The embedder will mostly be input type dependent, and there
|
||||
can be multiple embedders in a single network
|
||||
"""
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=None, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling=1.0, input_offset=0.0, input_clipping=None):
|
||||
self.name = name
|
||||
self.input_size = input_size
|
||||
self.activation_function = activation_function
|
||||
self.batchnorm = batchnorm
|
||||
self.dropout = dropout
|
||||
self.dropout_rate = 0
|
||||
self.input = None
|
||||
self.output = None
|
||||
self.scheme = scheme
|
||||
self.return_type = InputEmbedding
|
||||
self.layers = []
|
||||
self.input_rescaling = input_rescaling
|
||||
self.input_offset = input_offset
|
||||
self.input_clipping = input_clipping
|
||||
|
||||
def __call__(self, prev_input_placeholder=None):
|
||||
with tf.variable_scope(self.get_name()):
|
||||
if prev_input_placeholder is None:
|
||||
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
|
||||
else:
|
||||
self.input = prev_input_placeholder
|
||||
self._build_module()
|
||||
|
||||
return self.input, self.output
|
||||
|
||||
def _build_module(self):
|
||||
# NOTE: for image inputs, we expect the data to be of type uint8 in order to be memory efficient. We chose not
# to implement the rescaling as an input filter (filters.observation.observation_filter), as this would have
# caused the input to the network to be stored as float, which is 4x more expensive in memory,
# thus also making each saved transition in the memory 4x more expensive.
|
||||
|
||||
input_layer = self.input / self.input_rescaling
|
||||
input_layer -= self.input_offset
|
||||
# clip the input using the given range
|
||||
if self.input_clipping is not None:
|
||||
input_layer = tf.clip_by_value(input_layer, self.input_clipping[0], self.input_clipping[1])
|
||||
|
||||
self.layers.append(input_layer)
|
||||
|
||||
# layers order is conv -> batchnorm -> activation -> dropout
|
||||
if isinstance(self.scheme, EmbedderScheme):
|
||||
layers_params = self.schemes[self.scheme]
|
||||
else:
|
||||
layers_params = self.scheme
|
||||
for idx, layer_params in enumerate(layers_params):
|
||||
self.layers.append(
|
||||
layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
|
||||
)
|
||||
|
||||
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
|
||||
self.activation_function, self.dropout,
|
||||
self.dropout_rate, idx))
|
||||
|
||||
self.output = tf.contrib.layers.flatten(self.layers[-1])
|
||||
|
||||
@property
|
||||
def input_size(self) -> List[int]:
|
||||
return self._input_size
|
||||
|
||||
@input_size.setter
|
||||
def input_size(self, value: Union[int, List[int]]):
|
||||
if isinstance(value, np.ndarray) or isinstance(value, tuple):
|
||||
value = list(value)
|
||||
elif isinstance(value, int):
|
||||
value = [value]
|
||||
if not isinstance(value, list):
|
||||
raise ValueError((
|
||||
'input_size expected to be a list, found {value} which has type {type}'
|
||||
).format(value=value, type=type(value)))
|
||||
self._input_size = value
|
||||
|
||||
@property
|
||||
def schemes(self):
|
||||
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
|
||||
"configurations.")
|
||||
|
||||
def get_name(self):
|
||||
return self.name
|
||||
@@ -0,0 +1,74 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Conv2d
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
|
||||
from rl_coach.core_types import InputImageEmbedding
|
||||
|
||||
|
||||
class ImageEmbedder(InputEmbedder):
|
||||
"""
|
||||
An input embedder that performs convolutions on the input and then flattens the result.
|
||||
The embedder is intended for image like inputs, where the channels are expected to be the last axis.
|
||||
The embedder also allows custom rescaling of the input prior to the neural network.
|
||||
"""
|
||||
schemes = {
|
||||
EmbedderScheme.Empty:
|
||||
[],
|
||||
|
||||
EmbedderScheme.Shallow:
|
||||
[
|
||||
Conv2d([32, 3, 1])
|
||||
],
|
||||
|
||||
# atari dqn
|
||||
EmbedderScheme.Medium:
|
||||
[
|
||||
Conv2d([32, 8, 4]),
|
||||
Conv2d([64, 4, 2]),
|
||||
Conv2d([64, 3, 1])
|
||||
],
|
||||
|
||||
# carla
|
||||
EmbedderScheme.Deep: \
|
||||
[
|
||||
Conv2d([32, 5, 2]),
|
||||
Conv2d([32, 3, 1]),
|
||||
Conv2d([64, 3, 2]),
|
||||
Conv2d([64, 3, 1]),
|
||||
Conv2d([128, 3, 2]),
|
||||
Conv2d([128, 3, 1]),
|
||||
Conv2d([256, 3, 2]),
|
||||
Conv2d([256, 3, 1])
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling: float=255.0, input_offset: float=0.0, input_clipping=None):
|
||||
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name, input_rescaling,
|
||||
input_offset, input_clipping)
|
||||
self.return_type = InputImageEmbedding
|
||||
if len(input_size) != 3 and scheme != EmbedderScheme.Empty:
|
||||
raise ValueError("Image embedders expect the input size to have 3 dimensions. The given size is: {}"
|
||||
.format(input_size))
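
# Illustrative usage sketch (not part of the original module): an Atari-style embedder over a stack
# of four 84x84 frames, using the Medium (DQN-like) scheme defined above. The sizes and the
# 'observation' name are example values.
def _example_image_embedder():
    embedder = ImageEmbedder([84, 84, 4], scheme=EmbedderScheme.Medium, name="observation")
    observation_placeholder, embedding = embedder()
    return observation_placeholder, embedding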
|
||||
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
|
||||
from rl_coach.core_types import InputVectorEmbedding
|
||||
|
||||
|
||||
class VectorEmbedder(InputEmbedder):
|
||||
"""
|
||||
An input embedder that is intended for inputs that can be represented as vectors.
|
||||
The embedder flattens the input, applies several dense layers to it and returns the output.
|
||||
"""
|
||||
schemes = {
|
||||
EmbedderScheme.Empty:
|
||||
[],
|
||||
|
||||
EmbedderScheme.Shallow:
|
||||
[
|
||||
Dense([128])
|
||||
],
|
||||
|
||||
# dqn
|
||||
EmbedderScheme.Medium:
|
||||
[
|
||||
Dense([256])
|
||||
],
|
||||
|
||||
# carla
|
||||
EmbedderScheme.Deep: \
|
||||
[
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
Dense([128])
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling: float=1.0, input_offset:float=0.0, input_clipping=None):
|
||||
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name,
|
||||
input_rescaling, input_offset, input_clipping)
|
||||
|
||||
self.return_type = InputVectorEmbedding
|
||||
if len(self.input_size) != 1 and scheme != EmbedderScheme.Empty:
|
||||
raise ValueError("The input size of a vector embedder must contain only a single dimension")
|
||||
rl_coach/architectures/tensorflow_components/general_network.py (new file, 344 lines)
@@ -0,0 +1,344 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.middleware import MiddlewareParameters
|
||||
from rl_coach.base_parameters import AgentParameters, InputEmbedderParameters, EmbeddingMergerType
|
||||
from rl_coach.spaces import SpacesDefinition, PlanarMapsObservationSpace
|
||||
from rl_coach.utils import get_all_subclasses, dynamic_import_and_instantiate_module_from_params
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.architecture import TensorFlowArchitecture
|
||||
from rl_coach.core_types import PredictionType
|
||||
|
||||
|
||||
class GeneralTensorFlowNetwork(TensorFlowArchitecture):
|
||||
"""
|
||||
A generalized version of all possible networks implemented using tensorflow.
|
||||
"""
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str,
|
||||
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
|
||||
"""
|
||||
:param agent_parameters: the agent parameters
|
||||
:param spaces: the spaces definition of the agent
|
||||
:param name: the name of the network
|
||||
:param global_network: the global network replica that is shared between all the workers
|
||||
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
|
||||
:param network_is_trainable: is the network trainable (we can apply gradients on it)
|
||||
"""
|
||||
self.global_network = global_network
|
||||
self.network_is_local = network_is_local
|
||||
self.network_wrapper_name = name.split('/')[0]
|
||||
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
|
||||
self.num_heads_per_network = 1 if self.network_parameters.use_separate_networks_per_head else \
|
||||
len(self.network_parameters.heads_parameters)
|
||||
self.num_networks = 1 if not self.network_parameters.use_separate_networks_per_head else \
|
||||
len(self.network_parameters.heads_parameters)
|
||||
|
||||
self.gradients_from_head_rescalers = []
|
||||
self.gradients_from_head_rescalers_placeholders = []
|
||||
self.update_head_rescaler_value_ops = []
|
||||
|
||||
self.adaptive_learning_rate_scheme = None
|
||||
self.current_learning_rate = None
|
||||
|
||||
# init network modules containers
|
||||
self.input_embedders = []
|
||||
self.output_heads = []
|
||||
super().__init__(agent_parameters, spaces, name, global_network,
|
||||
network_is_local, network_is_trainable)
|
||||
|
||||
def fill_return_types():
|
||||
ret_dict = {}
|
||||
for cls in get_all_subclasses(PredictionType):
|
||||
ret_dict[cls] = []
|
||||
components = self.input_embedders + [self.middleware] + self.output_heads
|
||||
for component in components:
|
||||
if not hasattr(component, 'return_type'):
|
||||
raise ValueError("{} has no return_type attribute. This should not happen.")
|
||||
if component.return_type is not None:
|
||||
ret_dict[component.return_type].append(component)
|
||||
|
||||
return ret_dict
|
||||
|
||||
self.available_return_types = fill_return_types()
|
||||
|
||||
def predict_with_prediction_type(self, states: Dict[str, np.ndarray],
|
||||
prediction_type: PredictionType) -> Dict[str, np.ndarray]:
|
||||
"""
|
||||
Search for the component(s) whose return_type is set to the requested PredictionType, and get
predictions from them.
|
||||
|
||||
:param states: The input states to the network.
|
||||
:param prediction_type: The requested PredictionType to look for in the network components
|
||||
:return: A dictionary with predictions for all components matching the requested prediction type
|
||||
"""
|
||||
|
||||
ret_dict = {}
|
||||
for component in self.available_return_types[prediction_type]:
|
||||
ret_dict[component] = self.predict(inputs=states, outputs=component.output)
|
||||
|
||||
return ret_dict
|
||||
|
||||
@staticmethod
|
||||
def get_activation_function(activation_function_string: str):
|
||||
"""
|
||||
Map the activation function from a string to the tensorflow framework equivalent
|
||||
:param activation_function_string: the type of the activation function
|
||||
:return: the tensorflow activation function
|
||||
"""
|
||||
activation_functions = {
|
||||
'relu': tf.nn.relu,
|
||||
'tanh': tf.nn.tanh,
|
||||
'sigmoid': tf.nn.sigmoid,
|
||||
'elu': tf.nn.elu,
|
||||
'selu': tf.nn.selu,
|
||||
'leaky_relu': tf.nn.leaky_relu,
|
||||
'none': None
|
||||
}
|
||||
assert activation_function_string in activation_functions.keys(), \
|
||||
"Activation function must be one of the following {}. instead it was: {}"\
|
||||
.format(activation_functions.keys(), activation_function_string)
|
||||
return activation_functions[activation_function_string]
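
# Illustrative usage sketch (added comment, not part of the original module):
#   GeneralTensorFlowNetwork.get_activation_function('relu')  # -> tf.nn.relu
#   GeneralTensorFlowNetwork.get_activation_function('none')  # -> None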
|
||||
|
||||
def get_input_embedder(self, input_name: str, embedder_params: InputEmbedderParameters):
|
||||
"""
|
||||
Given an input embedder parameters class, creates the input embedder and returns it
|
||||
:param input_name: the name of the input to the embedder (used for retrieving the shape). The input should
|
||||
be a value within the state or the action.
|
||||
:param embedder_params: the parameters of the class of the embedder
|
||||
:return: the embedder instance
|
||||
"""
|
||||
allowed_inputs = copy.copy(self.spaces.state.sub_spaces)
|
||||
allowed_inputs["action"] = copy.copy(self.spaces.action)
|
||||
allowed_inputs["goal"] = copy.copy(self.spaces.goal)
|
||||
|
||||
if input_name not in allowed_inputs.keys():
|
||||
raise ValueError("The key for the input embedder ({}) must match one of the following keys: {}"
|
||||
.format(input_name, allowed_inputs.keys()))
|
||||
|
||||
type = "vector"
|
||||
if isinstance(allowed_inputs[input_name], PlanarMapsObservationSpace):
|
||||
type = "image"
|
||||
|
||||
embedder_path = 'rl_coach.architectures.tensorflow_components.embedders.' + embedder_params.path[type]
|
||||
embedder_params_copy = copy.copy(embedder_params)
|
||||
embedder_params_copy.activation_function = self.get_activation_function(embedder_params.activation_function)
|
||||
embedder_params_copy.input_rescaling = embedder_params_copy.input_rescaling[type]
|
||||
embedder_params_copy.input_offset = embedder_params_copy.input_offset[type]
|
||||
embedder_params_copy.name = input_name
|
||||
module = dynamic_import_and_instantiate_module_from_params(embedder_params_copy,
|
||||
path=embedder_path,
|
||||
positional_args=[allowed_inputs[input_name].shape])
|
||||
return module
|
||||
|
||||
def get_middleware(self, middleware_params: MiddlewareParameters):
|
||||
"""
|
||||
Given a middleware type, creates the middleware and returns it
|
||||
:param middleware_params: the parameters of the middleware class
|
||||
:return: the middleware instance
|
||||
"""
|
||||
middleware_params_copy = copy.copy(middleware_params)
|
||||
middleware_params_copy.activation_function = self.get_activation_function(middleware_params.activation_function)
|
||||
module = dynamic_import_and_instantiate_module_from_params(middleware_params_copy)
|
||||
return module
|
||||
|
||||
def get_output_head(self, head_params: HeadParameters, head_idx: int, loss_weight: float=1.):
|
||||
"""
|
||||
Given a head type, creates the head and returns it
|
||||
:param head_params: the parameters of the head to create. The head class to instantiate is resolved
from these parameters (a path under the heads directory, or a full path in the
structure <module_path>:<class_path>)
:param head_idx: the head index
:param loss_weight: the weight to assign to this head's loss
|
||||
:return: the head
|
||||
"""
|
||||
|
||||
head_params_copy = copy.copy(head_params)
|
||||
head_params_copy.activation_function = self.get_activation_function(head_params_copy.activation_function)
|
||||
return dynamic_import_and_instantiate_module_from_params(head_params_copy, extra_kwargs={
|
||||
'agent_parameters': self.ap, 'spaces': self.spaces, 'network_name': self.network_wrapper_name,
|
||||
'head_idx': head_idx, 'loss_weight': loss_weight, 'is_local': self.network_is_local})
|
||||
|
||||
def get_model(self):
|
||||
# validate the configuration
|
||||
if len(self.network_parameters.input_embedders_parameters) == 0:
|
||||
raise ValueError("At least one input type should be defined")
|
||||
|
||||
if len(self.network_parameters.heads_parameters) == 0:
|
||||
raise ValueError("At least one output type should be defined")
|
||||
|
||||
if self.network_parameters.middleware_parameters is None:
|
||||
raise ValueError("Exactly one middleware type should be defined")
|
||||
|
||||
if len(self.network_parameters.loss_weights) == 0:
|
||||
raise ValueError("At least one loss weight should be defined")
|
||||
|
||||
if len(self.network_parameters.heads_parameters) != len(self.network_parameters.loss_weights):
|
||||
raise ValueError("Number of loss weights should match the number of output types")
|
||||
|
||||
for network_idx in range(self.num_networks):
|
||||
with tf.variable_scope('network_{}'.format(network_idx)):
|
||||
|
||||
####################
|
||||
# Input Embeddings #
|
||||
####################
|
||||
|
||||
state_embedding = []
|
||||
for input_name in sorted(self.network_parameters.input_embedders_parameters):
|
||||
input_type = self.network_parameters.input_embedders_parameters[input_name]
|
||||
# get the class of the input embedder
|
||||
input_embedder = self.get_input_embedder(input_name, input_type)
|
||||
self.input_embedders.append(input_embedder)
|
||||
|
||||
# input placeholders are reused between networks. on the first network, store the placeholders
|
||||
# generated by the input_embedders in self.inputs. on the rest of the networks, pass
|
||||
# the existing input_placeholders into the input_embedders.
|
||||
if network_idx == 0:
|
||||
input_placeholder, embedding = input_embedder()
|
||||
self.inputs[input_name] = input_placeholder
|
||||
else:
|
||||
input_placeholder, embedding = input_embedder(self.inputs[input_name])
|
||||
|
||||
state_embedding.append(embedding)
|
||||
|
||||
##########
|
||||
# Merger #
|
||||
##########
|
||||
|
||||
if len(state_embedding) == 1:
|
||||
state_embedding = state_embedding[0]
|
||||
else:
|
||||
if self.network_parameters.embedding_merger_type == EmbeddingMergerType.Concat:
|
||||
state_embedding = tf.concat(state_embedding, axis=-1, name="merger")
|
||||
elif self.network_parameters.embedding_merger_type == EmbeddingMergerType.Sum:
|
||||
state_embedding = tf.add_n(state_embedding, name="merger")
|
||||
|
||||
##############
|
||||
# Middleware #
|
||||
##############
|
||||
|
||||
self.middleware = self.get_middleware(self.network_parameters.middleware_parameters)
|
||||
_, self.state_embedding = self.middleware(state_embedding)
|
||||
|
||||
################
|
||||
# Output Heads #
|
||||
################
|
||||
|
||||
head_count = 0
|
||||
for head_idx in range(self.num_heads_per_network):
|
||||
for head_copy_idx in range(self.network_parameters.num_output_head_copies):
|
||||
if self.network_parameters.use_separate_networks_per_head:
|
||||
# if we use separate networks per head, then the head type corresponds to the network idx
|
||||
head_type_idx = network_idx
|
||||
head_count = network_idx
|
||||
else:
|
||||
# if we use a single network with multiple heads, then the head type is the current head idx
|
||||
head_type_idx = head_idx
|
||||
self.output_heads.append(
|
||||
self.get_output_head(self.network_parameters.heads_parameters[head_type_idx],
|
||||
head_copy_idx,
|
||||
self.network_parameters.loss_weights[head_type_idx])
|
||||
)
|
||||
|
||||
# rescale the gradients from the head
|
||||
self.gradients_from_head_rescalers.append(
|
||||
tf.get_variable('gradients_from_head_{}-{}_rescalers'.format(head_idx, head_copy_idx),
|
||||
initializer=float(
|
||||
self.network_parameters.rescale_gradient_from_head_by_factor[head_count]
|
||||
),
|
||||
dtype=tf.float32))
|
||||
|
||||
self.gradients_from_head_rescalers_placeholders.append(
|
||||
tf.placeholder('float',
|
||||
name='gradients_from_head_{}-{}_rescalers'.format(head_type_idx, head_copy_idx)))
|
||||
|
||||
self.update_head_rescaler_value_ops.append(self.gradients_from_head_rescalers[head_count].assign(
|
||||
self.gradients_from_head_rescalers_placeholders[head_count]))
|
||||
|
||||
head_input = (1-self.gradients_from_head_rescalers[head_count]) * tf.stop_gradient(self.state_embedding) + \
|
||||
self.gradients_from_head_rescalers[head_count] * self.state_embedding
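# note: the two terms above sum to state_embedding, so the value fed forward into the head is unchanged;
# only the gradient flowing back from this head into the shared embedding is scaled by the rescaler
# (rescaler = 1 passes the full gradient, rescaler = 0 behaves like a full stop_gradient)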
|
||||
|
||||
# build the head
|
||||
if self.network_is_local:
|
||||
output, target_placeholder, input_placeholders, importance_weight_ph = \
|
||||
self.output_heads[-1](head_input)
|
||||
|
||||
self.targets.extend(target_placeholder)
|
||||
self.importance_weights.extend(importance_weight_ph)
|
||||
else:
|
||||
output, input_placeholders = self.output_heads[-1](head_input)
|
||||
|
||||
self.outputs.extend(output)
|
||||
# TODO: use head names as well
|
||||
for placeholder_index, input_placeholder in enumerate(input_placeholders):
|
||||
self.inputs['output_{}_{}'.format(head_type_idx, placeholder_index)] = input_placeholder
|
||||
|
||||
head_count += 1
|
||||
|
||||
# Losses
|
||||
self.losses = tf.losses.get_losses(self.full_name)
|
||||
self.losses += tf.losses.get_regularization_losses(self.full_name)
|
||||
self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.full_name)
|
||||
# tf.summary.scalar('total_loss', self.total_loss)
|
||||
|
||||
# Learning rate
|
||||
if self.network_parameters.learning_rate_decay_rate != 0:
|
||||
self.adaptive_learning_rate_scheme = \
|
||||
tf.train.exponential_decay(
|
||||
self.network_parameters.learning_rate,
|
||||
self.global_step,
|
||||
decay_steps=self.network_parameters.learning_rate_decay_steps,
|
||||
decay_rate=self.network_parameters.learning_rate_decay_rate,
|
||||
staircase=True)
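# with staircase=True the decayed learning rate follows:
# lr(step) = learning_rate * decay_rate ** floor(step / decay_steps)
# e.g. learning_rate=0.001, decay_rate=0.5, decay_steps=10000 halves the rate every 10k steps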
|
||||
|
||||
self.current_learning_rate = self.adaptive_learning_rate_scheme
|
||||
else:
|
||||
self.current_learning_rate = self.network_parameters.learning_rate
|
||||
|
||||
# Optimizer
|
||||
if self.distributed_training and self.network_is_local and self.network_parameters.shared_optimizer:
|
||||
# distributed training + is a local network + optimizer shared -> take the global optimizer
|
||||
self.optimizer = self.global_network.optimizer
|
||||
elif (self.distributed_training and self.network_is_local and not self.network_parameters.shared_optimizer) \
|
||||
or self.network_parameters.shared_optimizer or not self.distributed_training:
|
||||
# distributed training + is a global network + optimizer shared
|
||||
# OR
|
||||
# distributed training + is a local network + optimizer not shared
|
||||
# OR
|
||||
# non-distributed training
|
||||
# -> create an optimizer
|
||||
|
||||
if self.network_parameters.optimizer_type == 'Adam':
|
||||
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.current_learning_rate,
|
||||
beta1=self.network_parameters.adam_optimizer_beta1,
|
||||
beta2=self.network_parameters.adam_optimizer_beta2,
|
||||
epsilon=self.network_parameters.optimizer_epsilon)
|
||||
elif self.network_parameters.optimizer_type == 'RMSProp':
|
||||
self.optimizer = tf.train.RMSPropOptimizer(self.current_learning_rate,
|
||||
decay=self.network_parameters.rms_prop_optimizer_decay,
|
||||
epsilon=self.network_parameters.optimizer_epsilon)
|
||||
elif self.network_parameters.optimizer_type == 'LBFGS':
|
||||
self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B',
|
||||
options={'maxiter': 25})
|
||||
else:
|
||||
raise Exception("{} is not a valid optimizer type".format(self.network_parameters.optimizer_type))
|
||||
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class CategoricalQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params'):
|
||||
super().__init__(parameterized_class=CategoricalQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class CategoricalQHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'categorical_dqn_head'
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.num_atoms = agent_parameters.algorithm.atoms
|
||||
self.return_type = QActionStateValue
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.actions = tf.placeholder(tf.int32, [None], name="actions")
|
||||
self.input = [self.actions]
|
||||
|
||||
values_distribution = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
|
||||
values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
|
||||
self.num_atoms))
|
||||
# softmax on atoms dimension
|
||||
self.output = tf.nn.softmax(values_distribution)
|
||||
|
||||
# calculate cross entropy loss
|
||||
self.distributions = tf.placeholder(tf.float32, shape=(None, self.num_actions, self.num_atoms),
|
||||
name="distributions")
|
||||
self.target = self.distributions
|
||||
self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=values_distribution)
|
||||
tf.losses.add_loss(self.loss)
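# Illustrative numpy sketch (not part of this module): the head above outputs a probability mass over a
# fixed support of atoms per action, and acting greedily uses the expectation over that support,
# Q(s, a) = sum_i z_i * p_i(s, a). The support values below are assumed for the example only.
import numpy as np

num_actions, num_atoms = 3, 11
support = np.linspace(-10., 10., num_atoms)                          # assumed atom locations z_i
probs = np.random.dirichlet(np.ones(num_atoms), size=num_actions)    # softmax output, one row per action
q_values = (probs * support).sum(axis=-1)                            # expectation over the atoms
greedy_action = int(np.argmax(q_values))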
|
||||
@@ -0,0 +1,66 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class DDPGActorHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params', batchnorm: bool=True):
|
||||
super().__init__(parameterized_class=DDPGActor, activation_function=activation_function, name=name)
|
||||
self.batchnorm = batchnorm
|
||||
|
||||
|
||||
class DDPGActor(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh',
|
||||
batchnorm: bool=True):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ddpg_actor_head'
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
self.num_actions = self.spaces.action.shape
|
||||
|
||||
self.batchnorm = batchnorm
|
||||
|
||||
# bounded actions
|
||||
self.output_scale = self.spaces.action.max_abs_range
|
||||
|
||||
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
self.action_penalty = None
if hasattr(agent_parameters.algorithm, 'action_penalty'):
self.action_penalty = agent_parameters.algorithm.action_penalty
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# mean
|
||||
pre_activation_policy_values_mean = tf.layers.dense(input_layer, self.num_actions, name='fc_mean')
|
||||
policy_values_mean = batchnorm_activation_dropout(pre_activation_policy_values_mean, self.batchnorm,
|
||||
self.activation_function,
|
||||
False, 0, 0)[-1]
|
||||
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
|
||||
|
||||
if self.is_local:
|
||||
# add a penalty on the squared pre-activation values of the action
|
||||
if self.action_penalty and self.action_penalty != 0:
|
||||
self.regularizations += \
|
||||
[self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
|
||||
|
||||
self.output = [self.policy_mean]
|
||||
@@ -0,0 +1,87 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.memories.non_episodic import differentiable_neural_dictionary
|
||||
|
||||
|
||||
class DNDQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='dnd_q_head_params'):
|
||||
super().__init__(parameterized_class=DNDQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class DNDQHead(QHead):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'dnd_q_values_head'
|
||||
self.DND_size = agent_parameters.algorithm.dnd_size
|
||||
self.DND_key_error_threshold = agent_parameters.algorithm.DND_key_error_threshold
|
||||
self.l2_norm_added_delta = agent_parameters.algorithm.l2_norm_added_delta
|
||||
self.new_value_shift_coefficient = agent_parameters.algorithm.new_value_shift_coefficient
|
||||
self.number_of_nn = agent_parameters.algorithm.number_of_knn
|
||||
self.ap = agent_parameters
|
||||
self.dnd_embeddings = [None] * self.num_actions
|
||||
self.dnd_values = [None] * self.num_actions
|
||||
self.dnd_indices = [None] * self.num_actions
|
||||
self.dnd_distances = [None] * self.num_actions
|
||||
if self.ap.memory.shared_memory:
|
||||
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
if hasattr(self.ap.task_parameters, 'checkpoint_restore_dir') and self.ap.task_parameters.checkpoint_restore_dir:
|
||||
self.DND = differentiable_neural_dictionary.load_dnd(self.ap.task_parameters.checkpoint_restore_dir)
|
||||
else:
|
||||
self.DND = differentiable_neural_dictionary.QDND(
|
||||
self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient,
|
||||
key_error_threshold=self.DND_key_error_threshold,
|
||||
learning_rate=self.network_parameters.learning_rate,
|
||||
num_neighbors=self.number_of_nn,
|
||||
override_existing_keys=True)
|
||||
|
||||
# Retrieve info from DND dictionary
|
||||
# We assume that all actions have enough entries in the DND
|
||||
self.output = tf.transpose([
|
||||
self._q_value(input_layer, action)
|
||||
for action in range(self.num_actions)
|
||||
])
|
||||
|
||||
def _q_value(self, input_layer, action):
|
||||
result = tf.py_func(self.DND.query,
|
||||
[input_layer, action, self.number_of_nn],
|
||||
[tf.float64, tf.float64, tf.int64])
|
||||
self.dnd_embeddings[action] = tf.to_float(result[0])
|
||||
self.dnd_values[action] = tf.to_float(result[1])
|
||||
self.dnd_indices[action] = result[2]
|
||||
|
||||
# DND calculation
|
||||
square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1))
|
||||
distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
|
||||
self.dnd_distances[action] = distances
|
||||
weights = 1.0 / distances
|
||||
normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
|
||||
q_value = tf.reduce_sum(self.dnd_values[action] * normalised_weights, axis=1)
|
||||
q_value.set_shape((None,))
|
||||
return q_value
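# Illustrative numpy sketch (not part of this module) of the DND read above: given the k nearest stored
# keys and their values, the Q-value is an inverse-squared-distance weighted average of the stored values.
import numpy as np

query = np.random.randn(4, 16)                 # batch of query embeddings
keys = np.random.randn(4, 5, 16)               # k=5 nearest stored keys per query
values = np.random.randn(4, 5)                 # stored Q-values for those keys
delta = 1e-3                                   # plays the role of l2_norm_added_delta

distances = np.square(keys - query[:, None, :]).sum(axis=2) + delta
weights = 1.0 / distances
weights /= weights.sum(axis=1, keepdims=True)
q = (values * weights).sum(axis=1)             # one Q-value per batch entry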
|
||||
|
||||
def _post_build(self):
|
||||
# DND gradients
|
||||
self.dnd_embeddings_grad = tf.gradients(self.loss[0], self.dnd_embeddings)
|
||||
self.dnd_values_grad = tf.gradients(self.loss[0], self.dnd_values)
|
||||
@@ -0,0 +1,50 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
|
||||
class DuelingQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='dueling_q_head_params'):
|
||||
super().__init__(parameterized_class=DuelingQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class DuelingQHead(QHead):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'dueling_q_values_head'
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# state value tower - V
|
||||
with tf.variable_scope("state_value"):
|
||||
state_value = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
|
||||
state_value = tf.layers.dense(state_value, 1, name='fc2')
|
||||
# state_value = tf.expand_dims(state_value, axis=-1)
|
||||
|
||||
# action advantage tower - A
|
||||
with tf.variable_scope("action_advantage"):
|
||||
action_advantage = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
|
||||
action_advantage = tf.layers.dense(action_advantage, self.num_actions, name='fc2')
|
||||
action_advantage = action_advantage - tf.reduce_mean(action_advantage)
|
||||
|
||||
# merge to state-action value function Q
|
||||
self.output = tf.add(state_value, action_advantage, name='output')
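# Illustrative numpy sketch (not part of this module) of the dueling aggregation
# Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'), where the mean is taken over the action dimension
# separately for every state in the batch (Wang et al., Dueling Network Architectures).
import numpy as np

batch, num_actions = 2, 4
state_value = np.random.randn(batch, 1)            # V(s)
advantage = np.random.randn(batch, num_actions)    # A(s, a)
q = state_value + advantage - advantage.mean(axis=1, keepdims=True)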
|
||||
165
rl_coach/architectures/tensorflow_components/heads/head.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from typing import Type
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters, Parameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from tensorflow.python.ops.losses.losses_impl import Reduction
|
||||
|
||||
from rl_coach.utils import force_list
|
||||
|
||||
|
||||
# Used to initialize weights for policy and value output layers
|
||||
def normalized_columns_initializer(std=1.0):
|
||||
def _initializer(shape, dtype=None, partition_info=None):
|
||||
out = np.random.randn(*shape).astype(np.float32)
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
return _initializer
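# Illustrative check (not part of this module) of the initializer above: every output column is rescaled
# so that its l2 norm equals std, which keeps the initial outputs of the layer small and well conditioned.
import numpy as np

std = 0.01
w = np.random.randn(128, 4).astype(np.float32)
w *= std / np.sqrt(np.square(w).sum(axis=0, keepdims=True))
print(np.linalg.norm(w, axis=0))   # ~[0.01, 0.01, 0.01, 0.01]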
|
||||
|
||||
|
||||
class HeadParameters(Parameters):
|
||||
def __init__(self, parameterized_class: Type['Head'], activation_function: str = 'relu', name: str= 'head'):
|
||||
super().__init__()
|
||||
self.activation_function = activation_function
|
||||
self.name = name
|
||||
self.parameterized_class_name = parameterized_class.__name__
|
||||
|
||||
|
||||
class Head(object):
|
||||
"""
|
||||
A head is the final part of the network. It takes the embedding from the middleware embedder and passes it through
|
||||
a neural network to produce the output of the network. There can be multiple heads in a network, and each one has
|
||||
an assigned loss function. The heads are algorithm dependent.
|
||||
"""
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int=0, loss_weight: float=1., is_local: bool=True, activation_function: str='relu'):
|
||||
self.head_idx = head_idx
|
||||
self.network_name = network_name
|
||||
self.network_parameters = agent_parameters.network_wrappers[self.network_name]
|
||||
self.name = "head"
|
||||
self.output = []
|
||||
self.loss = []
|
||||
self.loss_type = []
|
||||
self.regularizations = []
|
||||
self.loss_weight = force_list(loss_weight)
|
||||
self.target = []
|
||||
self.importance_weight = []
|
||||
self.input = []
|
||||
self.is_local = is_local
|
||||
self.ap = agent_parameters
|
||||
self.spaces = spaces
|
||||
self.return_type = None
|
||||
self.activation_function = activation_function
|
||||
|
||||
def __call__(self, input_layer):
|
||||
"""
|
||||
Wrapper for building the module graph including scoping and loss creation
|
||||
:param input_layer: the input to the graph
|
||||
:return: the output of the last layer and the target placeholder
|
||||
"""
|
||||
with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
|
||||
self._build_module(input_layer)
|
||||
|
||||
self.output = force_list(self.output)
|
||||
self.target = force_list(self.target)
|
||||
self.input = force_list(self.input)
|
||||
self.loss_type = force_list(self.loss_type)
|
||||
self.loss = force_list(self.loss)
|
||||
self.regularizations = force_list(self.regularizations)
|
||||
if self.is_local:
|
||||
self.set_loss()
|
||||
self._post_build()
|
||||
|
||||
if self.is_local:
|
||||
return self.output, self.target, self.input, self.importance_weight
|
||||
else:
|
||||
return self.output, self.input
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
"""
|
||||
Builds the graph of the module
|
||||
This method is called early on from __call__. It is expected to store the graph
|
||||
in self.output.
|
||||
:param input_layer: the input to the graph
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
|
||||
def _post_build(self):
|
||||
"""
|
||||
Optional function that allows adding any extra definitions after the head has been fully defined
|
||||
For example, this allows doing additional calculations that are based on the loss
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_name(self):
|
||||
"""
|
||||
Get a formatted name for the module
|
||||
:return: the formatted name
|
||||
"""
|
||||
return '{}_{}'.format(self.name, self.head_idx)
|
||||
|
||||
def set_loss(self):
"""
Creates a target placeholder and a weighted loss function for each loss_type and regularization
:return: None
"""
|
||||
|
||||
# there are heads that define the loss internally, but we need to create additional placeholders for them
|
||||
for idx in range(len(self.loss)):
|
||||
importance_weight = tf.placeholder('float',
|
||||
[None] + [1] * (len(self.target[idx].shape) - 1),
|
||||
'{}_importance_weight'.format(self.get_name()))
|
||||
self.importance_weight.append(importance_weight)
|
||||
|
||||
# add losses and target placeholder
|
||||
for idx in range(len(self.loss_type)):
|
||||
# create target placeholder
|
||||
target = tf.placeholder('float', self.output[idx].shape, '{}_target'.format(self.get_name()))
|
||||
self.target.append(target)
|
||||
|
||||
# create importance sampling weights placeholder
|
||||
num_target_dims = len(self.target[idx].shape)
|
||||
importance_weight = tf.placeholder('float', [None] + [1] * (num_target_dims - 1),
|
||||
'{}_importance_weight'.format(self.get_name()))
|
||||
self.importance_weight.append(importance_weight)
|
||||
|
||||
# compute the weighted loss. importance_weight weights over the samples in the batch, while self.loss_weight
|
||||
# weights the specific loss of this head against other losses in this head or in other heads
|
||||
loss_weight = self.loss_weight[idx]*importance_weight
|
||||
loss = self.loss_type[idx](self.target[-1], self.output[idx],
|
||||
scope=self.get_name(), reduction=Reduction.NONE, loss_collection=None)
|
||||
|
||||
# the loss is first summed over each sample in the batch and then the mean over the batch is taken
|
||||
loss = tf.reduce_mean(loss_weight*tf.reduce_sum(loss, axis=list(range(1, num_target_dims))))
|
||||
|
||||
# we add the loss to the losses collection and later we will extract it in general_network
|
||||
tf.losses.add_loss(loss)
|
||||
self.loss.append(loss)
|
||||
|
||||
# add regularizations
|
||||
for regularization in self.regularizations:
|
||||
self.loss.append(regularization)
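# Illustrative numpy sketch (not part of this module) of the weighting performed above: the per-element
# loss is summed over the non-batch dimensions, multiplied by the per-sample importance weight and by
# this head's loss weight, and finally averaged over the batch.
import numpy as np

per_element_loss = np.random.rand(8, 3)        # e.g. squared errors, batch of 8, 3 target dims
importance_weight = np.random.rand(8, 1)       # per-sample importance sampling weights
loss_weight = 1.0                              # this head's weight relative to other losses

per_sample_loss = per_element_loss.sum(axis=1)
total_loss = (loss_weight * importance_weight[:, 0] * per_sample_loss).mean()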
|
||||
|
||||
@classmethod
def path(cls):
# the name of the head class itself (cls.__class__ would be the metaclass, not the head)
return cls.__name__
|
||||
@@ -0,0 +1,65 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import Measurements
|
||||
|
||||
|
||||
class MeasurementsPredictionHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='measurements_prediction_head_params'):
|
||||
super().__init__(parameterized_class=MeasurementsPredictionHead,
|
||||
activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class MeasurementsPredictionHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'future_measurements_head'
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.num_measurements = self.spaces.state['measurements'].shape[0]
|
||||
self.num_prediction_steps = agent_parameters.algorithm.num_predicted_steps_ahead
|
||||
self.multi_step_measurements_size = self.num_measurements * self.num_prediction_steps
|
||||
self.return_type = Measurements
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# This is almost exactly the same as Dueling Network but we predict the future measurements for each action
|
||||
# actions expectation tower (expectation stream) - E
|
||||
with tf.variable_scope("expectation_stream"):
|
||||
expectation_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
|
||||
expectation_stream = tf.layers.dense(expectation_stream, self.multi_step_measurements_size, name='output')
|
||||
expectation_stream = tf.expand_dims(expectation_stream, axis=1)
|
||||
|
||||
# action fine differences tower (action stream) - A
|
||||
with tf.variable_scope("action_stream"):
|
||||
action_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
|
||||
action_stream = tf.layers.dense(action_stream, self.num_actions * self.multi_step_measurements_size,
|
||||
name='output')
|
||||
action_stream = tf.reshape(action_stream,
|
||||
(tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size))
|
||||
action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keepdims=True)
|
||||
|
||||
# merge to future measurements predictions
|
||||
self.output = tf.add(expectation_stream, action_stream, name='output')
|
||||
self.target = tf.placeholder(tf.float32, [None, self.num_actions, self.multi_step_measurements_size],
|
||||
name="targets")
|
||||
targets_nonan = tf.where(tf.is_nan(self.target), self.output, self.target)
|
||||
self.loss = tf.reduce_sum(tf.reduce_mean(tf.square(targets_nonan - self.output), reduction_indices=0))
|
||||
tf.losses.add_loss(self.loss_weight[0] * self.loss)
|
||||
@@ -0,0 +1,88 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import BoxActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class NAFHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='naf_head_params'):
|
||||
super().__init__(parameterized_class=NAFHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class NAFHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
if not isinstance(self.spaces.action, BoxActionSpace):
|
||||
raise ValueError("NAF works only for continuous action spaces (BoxActionSpace)")
|
||||
|
||||
self.name = 'naf_q_values_head'
|
||||
self.num_actions = self.spaces.action.shape[0]
|
||||
self.output_scale = self.spaces.action.max_abs_range
|
||||
self.return_type = QActionStateValue
|
||||
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
|
||||
self.loss_type = tf.losses.huber_loss
|
||||
else:
|
||||
self.loss_type = tf.losses.mean_squared_error
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# NAF
|
||||
self.action = tf.placeholder(tf.float32, [None, self.num_actions], name="action")
|
||||
self.input = self.action
|
||||
|
||||
# V Head
|
||||
self.V = tf.layers.dense(input_layer, 1, name='V')
|
||||
|
||||
# mu Head
|
||||
mu_unscaled = tf.layers.dense(input_layer, self.num_actions, activation=self.activation_function, name='mu_unscaled')
|
||||
self.mu = tf.multiply(mu_unscaled, self.output_scale, name='mu')
|
||||
|
||||
# A Head
|
||||
# l_vector holds the entries of a lower-triangular matrix
self.l_vector = tf.layers.dense(input_layer, (self.num_actions * (self.num_actions + 1)) // 2, name='l_vector')
|
||||
|
||||
# Convert l to a lower triangular matrix and exponentiate its diagonal
|
||||
|
||||
i = 0
|
||||
columns = []
|
||||
for col in range(self.num_actions):
|
||||
start_row = col
|
||||
num_non_zero_elements = self.num_actions - start_row
|
||||
zeros_column_part = tf.zeros_like(self.l_vector[:, 0:start_row])
|
||||
diag_element = tf.expand_dims(tf.exp(self.l_vector[:, i]), 1)
|
||||
non_zeros_non_diag_column_part = self.l_vector[:, (i + 1):(i + num_non_zero_elements)]
|
||||
columns.append(tf.concat([zeros_column_part, diag_element, non_zeros_non_diag_column_part], axis=1))
|
||||
i += num_non_zero_elements
|
||||
self.L = tf.transpose(tf.stack(columns, axis=1), (0, 2, 1))
|
||||
|
||||
# P = L*L^T
|
||||
self.P = tf.matmul(self.L, tf.transpose(self.L, (0, 2, 1)))
|
||||
|
||||
# A = -1/2 * (u - mu)^T * P * (u - mu)
|
||||
action_diff = tf.expand_dims(self.action - self.mu, -1)
|
||||
a_matrix_form = -0.5 * tf.matmul(tf.transpose(action_diff, (0, 2, 1)), tf.matmul(self.P, action_diff))
|
||||
self.A = tf.reshape(a_matrix_form, [-1, 1])
|
||||
|
||||
# Q Head
|
||||
self.Q = tf.add(self.V, self.A, name='Q')
|
||||
|
||||
self.output = self.Q
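# Illustrative numpy sketch (not part of this module) of the NAF decomposition built above: a flat vector
# of num_actions * (num_actions + 1) / 2 entries is unpacked into a lower-triangular matrix L with an
# exponentiated diagonal, P = L L^T is positive definite, and the advantage is a quadratic in (a - mu).
# The packing here is row-wise for brevity, whereas the module above fills L column by column.
import numpy as np

num_actions = 3
l_vector = np.random.randn(num_actions * (num_actions + 1) // 2)
L = np.zeros((num_actions, num_actions))
rows, cols = np.tril_indices(num_actions)
L[rows, cols] = l_vector
L[np.diag_indices(num_actions)] = np.exp(np.diag(L))   # exponentiate the diagonal

P = L @ L.T
action, mu, V = np.random.randn(num_actions), np.random.randn(num_actions), 0.7
A = -0.5 * (action - mu) @ P @ (action - mu)
Q = V + A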
|
||||
@@ -0,0 +1,151 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, CompoundActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import eps
|
||||
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
|
||||
|
||||
|
||||
class PolicyHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params'):
|
||||
super().__init__(parameterized_class=PolicyHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PolicyHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'policy_values_head'
|
||||
self.return_type = ActionProbabilities
|
||||
self.beta = None
|
||||
self.action_penalty = None
|
||||
|
||||
self.exploration_policy = agent_parameters.exploration
|
||||
|
||||
# a scalar weight that penalizes low entropy values to encourage exploration
|
||||
if hasattr(agent_parameters.algorithm, 'beta_entropy'):
|
||||
self.beta = agent_parameters.algorithm.beta_entropy
|
||||
|
||||
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
|
||||
if hasattr(agent_parameters.algorithm, 'action_penalty'):
|
||||
self.action_penalty = agent_parameters.algorithm.action_penalty
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.actions = []
|
||||
self.input = self.actions
|
||||
self.policy_distributions = []
|
||||
self.output = []
|
||||
|
||||
action_spaces = [self.spaces.action]
|
||||
if isinstance(self.spaces.action, CompoundActionSpace):
|
||||
action_spaces = self.spaces.action.sub_action_spaces
|
||||
|
||||
# create a compound action network
|
||||
for action_space_idx, action_space in enumerate(action_spaces):
|
||||
with tf.variable_scope("sub_action_{}".format(action_space_idx)):
|
||||
if isinstance(action_space, DiscreteActionSpace):
|
||||
# create a discrete action network (softmax probabilities output)
|
||||
self._build_discrete_net(input_layer, action_space)
|
||||
elif isinstance(action_space, BoxActionSpace):
|
||||
# create a continuous action network (bounded mean and stdev outputs)
|
||||
self._build_continuous_net(input_layer, action_space)
|
||||
|
||||
if self.is_local:
|
||||
# add entropy regularization
|
||||
if self.beta:
|
||||
self.entropy = tf.add_n([tf.reduce_mean(dist.entropy()) for dist in self.policy_distributions])
|
||||
self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
|
||||
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
# calculate loss
|
||||
self.action_log_probs_wrt_policy = \
|
||||
tf.add_n([dist.log_prob(action) for dist, action in zip(self.policy_distributions, self.actions)])
|
||||
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
|
||||
self.target = self.advantages
|
||||
self.loss = -tf.reduce_mean(self.action_log_probs_wrt_policy * self.advantages)
|
||||
tf.losses.add_loss(self.loss_weight[0] * self.loss)
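# Illustrative numpy sketch (not part of this module) of the policy-gradient loss formed above:
# loss = -mean( log pi(a_t | s_t) * A_t ), so actions with positive advantage become more likely.
import numpy as np

log_probs = np.log(np.array([0.2, 0.7, 0.5]))   # log pi(a_t | s_t) for 3 sampled transitions
advantages = np.array([1.5, -0.3, 0.8])
loss = -(log_probs * advantages).mean()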
|
||||
|
||||
def _build_discrete_net(self, input_layer, action_space):
|
||||
num_actions = len(action_space.actions)
|
||||
self.actions.append(tf.placeholder(tf.int32, [None], name="actions"))
|
||||
|
||||
policy_values = tf.layers.dense(input_layer, num_actions, name='fc')
|
||||
self.policy_probs = tf.nn.softmax(policy_values, name="policy")
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
# (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
|
||||
policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
|
||||
self.policy_distributions.append(policy_distribution)
|
||||
self.output.append(self.policy_probs)
|
||||
|
||||
def _build_continuous_net(self, input_layer, action_space):
|
||||
num_actions = action_space.shape
|
||||
self.actions.append(tf.placeholder(tf.float32, [None, num_actions], name="actions"))
|
||||
|
||||
# output activation function
|
||||
if np.all(self.spaces.action.max_abs_range < np.inf):
|
||||
# bounded actions
|
||||
self.output_scale = action_space.max_abs_range
|
||||
self.continuous_output_activation = self.activation_function
|
||||
else:
|
||||
# unbounded actions
|
||||
self.output_scale = 1
|
||||
self.continuous_output_activation = None
|
||||
|
||||
# mean
|
||||
pre_activation_policy_values_mean = tf.layers.dense(input_layer, num_actions, name='fc_mean')
|
||||
policy_values_mean = self.continuous_output_activation(pre_activation_policy_values_mean)
|
||||
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
|
||||
|
||||
self.output.append(self.policy_mean)
|
||||
|
||||
# standard deviation
|
||||
if isinstance(self.exploration_policy, ContinuousEntropyParameters):
|
||||
# the stdev is an output of the network and uses a softplus activation as defined in A3C
|
||||
policy_values_std = tf.layers.dense(input_layer, num_actions,
|
||||
kernel_initializer=normalized_columns_initializer(0.01), name='fc_std')
|
||||
self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps
|
||||
|
||||
self.output.append(self.policy_std)
|
||||
else:
|
||||
# the stdev is an externally given value
|
||||
# Warning: we need to explicitly put this variable in the local variables collections, since defining
|
||||
# it as not trainable puts it for some reason in the global variables collections. If this is not done,
|
||||
# the variable won't be initialized and when working with multiple workers they will get stuck.
|
||||
self.policy_std = tf.Variable(np.ones(num_actions), dtype='float32', trainable=False,
|
||||
name='policy_stdev', collections=[tf.GraphKeys.LOCAL_VARIABLES])
|
||||
|
||||
# assign op for the policy std
|
||||
self.policy_std_placeholder = tf.placeholder('float32', (num_actions,))
|
||||
self.assign_policy_std = tf.assign(self.policy_std, self.policy_std_placeholder)
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std)
|
||||
self.policy_distributions.append(policy_distribution)
|
||||
|
||||
if self.is_local:
|
||||
# add a penalty on the squared pre-activation values of the action
|
||||
if self.action_penalty and self.action_penalty != 0:
|
||||
self.regularizations += [
|
||||
self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
|
||||
144
rl_coach/architectures/tensorflow_components/heads/ppo_head.py
Normal file
@@ -0,0 +1,144 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import eps
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters, normalized_columns_initializer
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class PPOHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='ppo_head_params'):
|
||||
super().__init__(parameterized_class=PPOHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PPOHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ppo_head'
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
# used in regular PPO
|
||||
self.use_kl_regularization = agent_parameters.algorithm.use_kl_regularization
|
||||
if self.use_kl_regularization:
|
||||
# kl coefficient and its corresponding assignment operation and placeholder
|
||||
self.kl_coefficient = tf.Variable(agent_parameters.algorithm.initial_kl_coefficient,
|
||||
trainable=False, name='kl_coefficient')
|
||||
self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
|
||||
self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
|
||||
self.kl_cutoff = 2 * agent_parameters.algorithm.target_kl_divergence
|
||||
self.high_kl_penalty_coefficient = agent_parameters.algorithm.high_kl_penalty_coefficient
|
||||
|
||||
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
|
||||
self.beta = agent_parameters.algorithm.beta_entropy
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
if isinstance(self.spaces.action, DiscreteActionSpace):
|
||||
self._build_discrete_net(input_layer, self.spaces.action)
|
||||
elif isinstance(self.spaces.action, BoxActionSpace):
|
||||
self._build_continuous_net(input_layer, self.spaces.action)
|
||||
else:
|
||||
raise ValueError("only discrete or continuous action spaces are supported for PPO")
|
||||
|
||||
self.action_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
|
||||
self.action_probs_wrt_old_policy = self.old_policy_distribution.log_prob(self.actions)
|
||||
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
|
||||
|
||||
# Used by regular PPO only
|
||||
# add kl divergence regularization
|
||||
self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(self.old_policy_distribution, self.policy_distribution))
|
||||
|
||||
if self.use_kl_regularization:
|
||||
# no clipping => use kl regularization
|
||||
self.weighted_kl_divergence = tf.multiply(self.kl_coefficient, self.kl_divergence)
|
||||
self.regularizations = self.weighted_kl_divergence + self.high_kl_penalty_coefficient * \
|
||||
tf.square(tf.maximum(0.0, self.kl_divergence - self.kl_cutoff))
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
# calculate surrogate loss
|
||||
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
|
||||
self.target = self.advantages
|
||||
# exponentiating the difference of the log-probabilities gives the likelihood ratio pi / pi_old, which is always positive
self.likelihood_ratio = tf.exp(self.action_probs_wrt_policy - self.action_probs_wrt_old_policy)
|
||||
if self.clip_likelihood_ratio_using_epsilon is not None:
|
||||
self.clip_param_rescaler = tf.placeholder(tf.float32, ())
|
||||
self.input.append(self.clip_param_rescaler)
|
||||
max_value = 1 + self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
|
||||
min_value = 1 - self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
|
||||
self.clipped_likelihood_ratio = tf.clip_by_value(self.likelihood_ratio, min_value, max_value)
|
||||
self.scaled_advantages = tf.minimum(self.likelihood_ratio * self.advantages,
|
||||
self.clipped_likelihood_ratio * self.advantages)
|
||||
else:
|
||||
self.scaled_advantages = self.likelihood_ratio * self.advantages
|
||||
# the minus sign turns maximizing the surrogate objective into a minimization problem for the optimizer
|
||||
self.surrogate_loss = -tf.reduce_mean(self.scaled_advantages)
|
||||
if self.is_local:
|
||||
# add entropy regularization
|
||||
if self.beta:
|
||||
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
|
||||
self.regularizations = -tf.multiply(self.beta, self.entropy, name='entropy_regularization')
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
self.loss = self.surrogate_loss
|
||||
tf.losses.add_loss(self.loss)
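# Illustrative numpy sketch (not part of this module) of the clipped surrogate objective assembled above:
# L = -mean( min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ), with r_t = pi(a|s) / pi_old(a|s).
import numpy as np

eps = 0.2                                      # clip_likelihood_ratio_using_epsilon * clip_param_rescaler
ratio = np.array([0.7, 1.5, 1.05])             # likelihood ratios for 3 transitions
advantages = np.array([1.0, 1.0, -2.0])
clipped = np.clip(ratio, 1 - eps, 1 + eps)
surrogate_loss = -np.minimum(ratio * advantages, clipped * advantages).mean()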
|
||||
|
||||
def _build_discrete_net(self, input_layer, action_space):
|
||||
num_actions = len(action_space.actions)
|
||||
self.actions = tf.placeholder(tf.int32, [None], name="actions")
|
||||
|
||||
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
|
||||
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
|
||||
|
||||
# Policy Head
|
||||
self.input = [self.actions, self.old_policy_mean]
|
||||
policy_values = tf.layers.dense(input_layer, num_actions, name='policy_fc')
|
||||
self.policy_mean = tf.nn.softmax(policy_values, name="policy")
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
|
||||
self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
|
||||
|
||||
self.output = self.policy_mean
|
||||
|
||||
def _build_continuous_net(self, input_layer, action_space):
|
||||
num_actions = action_space.shape[0]
|
||||
self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")
|
||||
|
||||
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
|
||||
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
|
||||
|
||||
self.input = [self.actions, self.old_policy_mean, self.old_policy_std]
|
||||
self.policy_mean = tf.layers.dense(input_layer, num_actions, name='policy_mean',
|
||||
kernel_initializer=normalized_columns_initializer(0.01))
|
||||
if self.is_local:
|
||||
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
|
||||
collections=[tf.GraphKeys.LOCAL_VARIABLES])
|
||||
else:
|
||||
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32')
|
||||
|
||||
self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std + eps)
|
||||
self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, self.old_policy_std + eps)
|
||||
|
||||
self.output = [self.policy_mean, self.policy_std]
|
||||
@@ -0,0 +1,52 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class PPOVHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='ppo_v_head_params'):
|
||||
super().__init__(parameterized_class=PPOVHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PPOVHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ppo_v_head'
|
||||
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
|
||||
self.input = [self.old_policy_value]
|
||||
self.output = tf.layers.dense(input_layer, 1, name='output',
|
||||
kernel_initializer=normalized_columns_initializer(1.0))
|
||||
self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")
|
||||
|
||||
value_loss_1 = tf.square(self.output - self.target)
|
||||
value_loss_2 = tf.square(self.old_policy_value +
|
||||
tf.clip_by_value(self.output - self.old_policy_value,
|
||||
-self.clip_likelihood_ratio_using_epsilon,
|
||||
self.clip_likelihood_ratio_using_epsilon) - self.target)
|
||||
self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))
|
||||
self.loss = self.vf_loss
|
||||
tf.losses.add_loss(self.loss)
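# Illustrative numpy sketch (not part of this module) of the clipped value loss above: the update is
# pessimistic, taking the larger of the unclipped squared error and the error of a prediction that is
# only allowed to move by eps away from the old value estimate.
import numpy as np

eps = 0.2
v_new = np.array([1.0, 2.5])
v_old = np.array([0.8, 1.0])
returns = np.array([1.2, 1.1])
v_clipped = v_old + np.clip(v_new - v_old, -eps, eps)
vf_loss = np.maximum(np.square(v_new - returns), np.square(v_clipped - returns)).mean()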
|
||||
50
rl_coach/architectures/tensorflow_components/heads/q_head.py
Normal file
@@ -0,0 +1,50 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpace
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class QHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='q_head_params'):
|
||||
super().__init__(parameterized_class=QHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class QHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'q_values_head'
|
||||
if isinstance(self.spaces.action, BoxActionSpace):
|
||||
self.num_actions = 1
|
||||
elif isinstance(self.spaces.action, DiscreteActionSpace):
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.return_type = QActionStateValue
|
||||
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
|
||||
self.loss_type = tf.losses.huber_loss
|
||||
else:
|
||||
self.loss_type = tf.losses.mean_squared_error
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# Standard Q Network
|
||||
self.output = tf.layers.dense(input_layer, self.num_actions, name='output')
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition

from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue


class QuantileRegressionQHeadParameters(HeadParameters):
    def __init__(self, activation_function: str='relu', name: str='quantile_regression_q_head_params'):
        super().__init__(parameterized_class=QuantileRegressionQHead, activation_function=activation_function,
                         name=name)


class QuantileRegressionQHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
        self.name = 'quantile_regression_dqn_head'
        self.num_actions = len(self.spaces.action.actions)
        self.num_atoms = agent_parameters.algorithm.atoms  # we use atom / quantile interchangeably
        self.huber_loss_interval = agent_parameters.algorithm.huber_loss_interval  # k
        self.return_type = QActionStateValue

    def _build_module(self, input_layer):
        self.actions = tf.placeholder(tf.int32, [None, 2], name="actions")
        self.quantile_midpoints = tf.placeholder(tf.float32, [None, self.num_atoms], name="quantile_midpoints")
        self.input = [self.actions, self.quantile_midpoints]

        # the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
        quantiles_locations = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
        quantiles_locations = tf.reshape(quantiles_locations,
                                         (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
        self.output = quantiles_locations

        self.quantiles = tf.placeholder(tf.float32, shape=(None, self.num_atoms), name="quantiles")
        self.target = self.quantiles

        # only the quantiles of the taken action are taken into account
        quantiles_for_used_actions = tf.gather_nd(quantiles_locations, self.actions)

        # reorder the output quantiles and the target quantiles as a preparation step for calculating the loss
        # the output quantiles vector and the quantile midpoints are tiled as rows of an NxN matrix (N = num quantiles)
        # the target quantiles vector is tiled as columns of an NxN matrix
        theta_i = tf.tile(tf.expand_dims(quantiles_for_used_actions, -1), [1, 1, self.num_atoms])
        T_theta_j = tf.tile(tf.expand_dims(self.target, -2), [1, self.num_atoms, 1])
        tau_i = tf.tile(tf.expand_dims(self.quantile_midpoints, -1), [1, 1, self.num_atoms])

        # Huber loss of T(theta_j) - theta_i
        error = T_theta_j - theta_i
        abs_error = tf.abs(error)
        quadratic = tf.minimum(abs_error, self.huber_loss_interval)
        huber_loss = self.huber_loss_interval * (abs_error - quadratic) + 0.5 * quadratic ** 2

        # Quantile Huber loss
        quantile_huber_loss = tf.abs(tau_i - tf.cast(error < 0, dtype=tf.float32)) * huber_loss

        # Quantile regression loss (the probability for each quantile is 1/num_quantiles)
        quantile_regression_loss = tf.reduce_sum(quantile_huber_loss) / float(self.num_atoms)
        self.loss = quantile_regression_loss
        tf.losses.add_loss(self.loss)
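A short sketch of the inputs this head expects when its loss is evaluated. The feed values are illustrative assumptions, but they match the placeholder shapes defined above: [batch, 2] index pairs for tf.gather_nd, and [batch, N] quantile midpoints tau_hat_i = (i + 0.5) / N as in quantile regression DQN.

import numpy as np

num_atoms = 8      # N quantiles (agent_parameters.algorithm.atoms)
batch_size = 2

# each row is (batch index, action index), so tf.gather_nd picks the
# quantile locations of the action that was actually taken
actions_feed = np.array([[0, 3], [1, 1]], dtype=np.int32)

# quantile midpoints tau_hat_i = (i + 0.5) / N, tiled over the batch
tau_hat = (np.arange(num_atoms) + 0.5) / num_atoms
quantile_midpoints_feed = np.tile(tau_hat, (batch_size, 1)).astype(np.float32)

# the target quantiles (Bellman-updated quantile locations) would normally
# come from the target network; zeros here are just a placeholder value
quantiles_feed = np.zeros((batch_size, num_atoms), dtype=np.float32)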
45
rl_coach/architectures/tensorflow_components/heads/v_head.py
Normal file
@@ -0,0 +1,45 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition

from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.core_types import VStateValue


class VHeadParameters(HeadParameters):
    def __init__(self, activation_function: str='relu', name: str='v_head_params'):
        super().__init__(parameterized_class=VHead, activation_function=activation_function, name=name)


class VHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
        self.name = 'v_values_head'
        self.return_type = VStateValue

        if agent_parameters.network_wrappers[self.network_name.split('/')[0]].replace_mse_with_huber_loss:
            self.loss_type = tf.losses.huber_loss
        else:
            self.loss_type = tf.losses.mean_squared_error

    def _build_module(self, input_layer):
        # Standard V Network
        self.output = tf.layers.dense(input_layer, 1, name='output',
                                      kernel_initializer=normalized_columns_initializer(1.0))
@@ -0,0 +1,86 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, List

import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme

from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout, Dense
from rl_coach.core_types import Middleware_FC_Embedding


class FCMiddlewareParameters(MiddlewareParameters):
    def __init__(self, activation_function='relu',
                 scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_fc_embedder"):
        super().__init__(parameterized_class=FCMiddleware, activation_function=activation_function,
                         scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)


class FCMiddleware(Middleware):
    schemes = {
        MiddlewareScheme.Empty:
            [],

        # ppo
        MiddlewareScheme.Shallow:
            [
                Dense([64])
            ],

        # dqn
        MiddlewareScheme.Medium:
            [
                Dense([512])
            ],

        MiddlewareScheme.Deep:
            [
                Dense([128]),
                Dense([128]),
                Dense([128])
            ]
    }

    def __init__(self, activation_function=tf.nn.relu,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_fc_embedder"):
        super().__init__(activation_function=activation_function, batchnorm=batchnorm,
                         dropout=dropout, scheme=scheme, name=name)
        self.return_type = Middleware_FC_Embedding
        self.layers = []

    def _build_module(self):
        self.layers.append(self.input)

        if isinstance(self.scheme, MiddlewareScheme):
            layers_params = FCMiddleware.schemes[self.scheme]
        else:
            layers_params = self.scheme
        for idx, layer_params in enumerate(layers_params):
            self.layers.append(
                layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
            )

            self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
                                                            self.activation_function, self.dropout,
                                                            self.dropout_rate, idx))

        self.output = self.layers[-1]
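Because _build_module accepts either a MiddlewareScheme preset or a plain list of layer definitions, a custom stack can be passed in place of a preset. A hedged sketch using only names defined in this file (the layer sizes are arbitrary examples):

# preset: the Medium scheme above, i.e. a single Dense([512]) layer
fc_params = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)

# override: an explicit list of Dense layer definitions instead of a preset
custom_fc_params = FCMiddlewareParameters(scheme=[Dense([256]), Dense([256])],
                                          batchnorm=True)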
@@ -0,0 +1,113 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme

from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.core_types import Middleware_LSTM_Embedding


class LSTMMiddlewareParameters(MiddlewareParameters):
    def __init__(self, activation_function='relu', number_of_lstm_cells=256,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_lstm_embedder"):
        super().__init__(parameterized_class=LSTMMiddleware, activation_function=activation_function,
                         scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
        self.number_of_lstm_cells = number_of_lstm_cells


class LSTMMiddleware(Middleware):
    schemes = {
        MiddlewareScheme.Empty:
            [],

        # ppo
        MiddlewareScheme.Shallow:
            [
                [64]
            ],

        # dqn
        MiddlewareScheme.Medium:
            [
                [512]
            ],

        MiddlewareScheme.Deep:
            [
                [128],
                [128],
                [128]
            ]
    }

    def __init__(self, activation_function=tf.nn.relu, number_of_lstm_cells: int=256,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_lstm_embedder"):
        super().__init__(activation_function=activation_function, batchnorm=batchnorm,
                         dropout=dropout, scheme=scheme, name=name)
        self.return_type = Middleware_LSTM_Embedding
        self.number_of_lstm_cells = number_of_lstm_cells
        self.layers = []

    def _build_module(self):
        """
        self.state_in: tuple of placeholders containing the initial state
        self.state_out: tuple of output state

        todo: it appears that the shape of the output is batch, feature
        the code here seems to be slicing off the first element in the batch
        which would definitely be wrong. need to double check the shape
        """

        self.layers.append(self.input)

        # optionally insert some dense layers before the LSTM
        if isinstance(self.scheme, MiddlewareScheme):
            layers_params = LSTMMiddleware.schemes[self.scheme]
        else:
            layers_params = self.scheme
        for idx, layer_params in enumerate(layers_params):
            self.layers.append(
                tf.layers.dense(self.layers[-1], layer_params[0], name='fc{}'.format(idx))
            )

            self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
                                                            self.activation_function, self.dropout,
                                                            self.dropout_rate, idx))

        # add the LSTM layer
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.number_of_lstm_cells, state_is_tuple=True)
        self.c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        self.h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = [self.c_init, self.h_init]
        self.c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
        self.h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
        self.state_in = (self.c_in, self.h_in)
        rnn_in = tf.expand_dims(self.layers[-1], [0])
        step_size = tf.shape(self.layers[-1])[:1]
        state_in = tf.nn.rnn_cell.LSTMStateTuple(self.c_in, self.h_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False)
        lstm_c, lstm_h = lstm_state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        self.output = tf.reshape(lstm_outputs, [-1, self.number_of_lstm_cells])
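A rough, self-contained sketch of how the recurrent state placeholders defined above are threaded between consecutive steps, assuming a 64-dimensional embedding stands in for the aggregated input embedders; the actual wiring inside Coach differs:

import numpy as np
import tensorflow as tf

# stand-in for the embedding that would normally come from the input embedders
input_placeholder = tf.placeholder(tf.float32, [None, 64])

middleware = LSTMMiddleware(number_of_lstm_cells=256)
middleware(input_placeholder)   # builds the dense + LSTM stack under its variable scope

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    c, h = middleware.state_init            # zero state created in _build_module
    for _ in range(3):                      # three consecutive steps of one episode
        step_input = np.random.randn(1, 64).astype(np.float32)
        out, (c, h) = sess.run([middleware.output, middleware.state_out],
                               feed_dict={input_placeholder: step_input,
                                          middleware.c_in: c,
                                          middleware.h_in: h})
        # (c, h) is fed back in on the next step so the LSTM carries state across the episode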
@@ -0,0 +1,68 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type, Union, List

import tensorflow as tf
from rl_coach.base_parameters import MiddlewareScheme, Parameters

from rl_coach.core_types import MiddlewareEmbedding


class MiddlewareParameters(Parameters):
    def __init__(self, parameterized_class: Type['Middleware'],
                 activation_function: str='relu', scheme: Union[List, MiddlewareScheme]=MiddlewareScheme.Medium,
                 batchnorm: bool=False, dropout: bool=False,
                 name='middleware'):
        super().__init__()
        self.activation_function = activation_function
        self.scheme = scheme
        self.batchnorm = batchnorm
        self.dropout = dropout
        self.name = name
        self.parameterized_class_name = parameterized_class.__name__


class Middleware(object):
    """
    A middleware embedder is the middle part of the network. It takes the embeddings from the input embedders,
    after they were aggregated in some way (for example, concatenation), and passes them through a neural network
    which is customizable but shared between the heads of the network.
    """
    def __init__(self, activation_function=tf.nn.relu,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False, name="middleware_embedder"):
        self.name = name
        self.input = None
        self.output = None
        self.activation_function = activation_function
        self.batchnorm = batchnorm
        self.dropout = dropout
        self.dropout_rate = 0
        self.scheme = scheme
        self.return_type = MiddlewareEmbedding

    def __call__(self, input_layer):
        with tf.variable_scope(self.get_name()):
            self.input = input_layer
            self._build_module()

        return self.input, self.output

    def _build_module(self):
        pass

    def get_name(self):
        return self.name
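A minimal sketch of the contract a concrete middleware fulfils: __call__ stores the input, opens a variable scope and delegates to _build_module, which must set self.output. The IdentityMiddleware below is purely illustrative and not part of Coach:

import tensorflow as tf

class IdentityMiddleware(Middleware):
    """Illustrative middleware that passes the aggregated embedding through unchanged."""
    def _build_module(self):
        # self.input was set by Middleware.__call__ before this method runs
        self.output = tf.identity(self.input, name='identity')

embedding = tf.placeholder(tf.float32, [None, 32])
middleware_input, middleware_output = IdentityMiddleware(name='identity_middleware')(embedding)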
121
rl_coach/architectures/tensorflow_components/shared_variables.py
Normal file
@@ -0,0 +1,121 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import tensorflow as tf


class SharedRunningStats(object):
    def __init__(self, replicated_device=None, epsilon=1e-2, name="", create_ops=True):
        self.sess = None
        self.name = name
        self.replicated_device = replicated_device
        self.epsilon = epsilon
        self.ops_were_created = False
        if create_ops:
            with tf.device(replicated_device):
                self.create_ops()

    def create_ops(self, shape=[1], clip_values=None):
        self.clip_values = clip_values
        with tf.variable_scope(self.name):
            self._sum = tf.get_variable(
                dtype=tf.float64,
                initializer=tf.constant_initializer(0.0),
                name="running_sum", trainable=False, shape=shape, validate_shape=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES])
            self._sum_squared = tf.get_variable(
                dtype=tf.float64,
                initializer=tf.constant_initializer(self.epsilon),
                name="running_sum_squared", trainable=False, shape=shape, validate_shape=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES])
            self._count = tf.get_variable(
                dtype=tf.float64,
                shape=(),
                initializer=tf.constant_initializer(self.epsilon),
                name="count", trainable=False, collections=[tf.GraphKeys.GLOBAL_VARIABLES])

            self._shape = None
            self._mean = tf.div(self._sum, self._count, name="mean")
            self._std = tf.sqrt(tf.maximum((self._sum_squared - self._count*tf.square(self._mean))
                                           / tf.maximum(self._count-1, 1), self.epsilon), name="stdev")
            self.tf_mean = tf.cast(self._mean, 'float32')
            self.tf_std = tf.cast(self._std, 'float32')

            self.new_sum = tf.placeholder(dtype=tf.float64, name='sum')
            self.new_sum_squared = tf.placeholder(dtype=tf.float64, name='var')
            self.newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')

            self._inc_sum = tf.assign_add(self._sum, self.new_sum, use_locking=True)
            self._inc_sum_squared = tf.assign_add(self._sum_squared, self.new_sum_squared, use_locking=True)
            self._inc_count = tf.assign_add(self._count, self.newcount, use_locking=True)

            self.raw_obs = tf.placeholder(dtype=tf.float64, name='raw_obs')
            self.normalized_obs = (self.raw_obs - self._mean) / self._std
            if self.clip_values is not None:
                self.clipped_obs = tf.clip_by_value(self.normalized_obs, self.clip_values[0], self.clip_values[1])

        self.ops_were_created = True

    def set_session(self, sess):
        self.sess = sess

    def push(self, x):
        x = x.astype('float64')
        self.sess.run([self._inc_sum, self._inc_sum_squared, self._inc_count],
                      feed_dict={
                          self.new_sum: x.sum(axis=0).ravel(),
                          self.new_sum_squared: np.square(x).sum(axis=0).ravel(),
                          self.newcount: np.array(len(x), dtype='float64')
                      })
        if self._shape is None:
            self._shape = x.shape

    @property
    def n(self):
        return self.sess.run(self._count)

    @property
    def mean(self):
        return self.sess.run(self._mean)

    @property
    def var(self):
        return self.std ** 2

    @property
    def std(self):
        return self.sess.run(self._std)

    @property
    def shape(self):
        return self._shape

    @shape.setter
    def shape(self, val):
        self._shape = val
        self.new_sum.set_shape(val)
        self.new_sum_squared.set_shape(val)
        self.tf_mean.set_shape(val)
        self.tf_std.set_shape(val)
        self._sum.set_shape(val)
        self._sum_squared.set_shape(val)

    def normalize(self, batch):
        if self.clip_values is not None:
            return self.sess.run(self.clipped_obs, feed_dict={self.raw_obs: batch})
        else:
            return self.sess.run(self.normalized_obs, feed_dict={self.raw_obs: batch})
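A short usage sketch of the running-statistics filter above, assuming a single-process setup (no replicated device). The class accumulates a running sum, sum of squares and count, so the statistics can be recovered incrementally as mean = sum / count and std = sqrt((sum_sq - count * mean^2) / (count - 1)); the observation dimensionality and clip range below are arbitrary examples.

import numpy as np
import tensorflow as tf

stats = SharedRunningStats(name='observation_stats', create_ops=False)
stats.create_ops(shape=[3], clip_values=(-5.0, 5.0))  # 3-dim observations, clipped to +/- 5 std

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    stats.set_session(sess)

    batch = np.random.randn(32, 3)       # a batch of 32 observations
    stats.push(batch)                    # updates the running sum / sum-of-squares / count
    normalized = stats.normalize(batch)  # (x - running mean) / running std, then clipped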