mirror of https://github.com/gryf/coach.git synced 2025-12-17 11:10:20 +01:00

Enabling Coach Documentation to be run even when environments are not installed (#326)

Authored by anabwan on 2019-05-27 10:46:07 +03:00, committed by Gal Leibovich
Parent: 2b7d536da4, commit: 342b7184bc
157 changed files with 5167 additions and 7477 deletions


@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Additional Parameters &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Additional Parameters &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script type="text/javascript" src="../_static/jquery.js"></script>
<script type="text/javascript" src="../_static/underscore.js"></script>
<script type="text/javascript" src="../_static/doctools.js"></script>
<script type="text/javascript" src="../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../_static/css/custom.css" type="text/css" />
@@ -32,21 +40,16 @@
<link rel="prev" title="Spaces" href="spaces.html" />
<link href="../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -193,51 +196,47 @@
<dl class="class">
<dt id="rl_coach.base_parameters.VisualizationParameters">
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">VisualizationParameters</code><span class="sig-paren">(</span><em>print_networks_summary=False</em>, <em>dump_csv=True</em>, <em>dump_signals_to_csv_every_x_episodes=5</em>, <em>dump_gifs=False</em>, <em>dump_mp4=False</em>, <em>video_dump_methods=None</em>, <em>dump_in_episode_signals=False</em>, <em>dump_parameters_documentation=True</em>, <em>render=False</em>, <em>native_rendering=False</em>, <em>max_fps_for_human_control=10</em>, <em>tensorboard=False</em>, <em>add_rendered_image_to_env_response=False</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/base_parameters.html#VisualizationParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.VisualizationParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>print_networks_summary</strong> If set to True, a summary of the structure of all the networks will be printed at the beginning of the experiment</li>
<li><strong>dump_csv</strong> If set to True, the logger will dump logs to a csv file once in every dump_signals_to_csv_every_x_episodes
episodes. The logs can be later used to visualize the training process using Coach Dashboard.</li>
<li><strong>dump_signals_to_csv_every_x_episodes</strong> Defines the number of episodes between writing new data to the csv log files. Lower values can affect
performance, as writing to disk may take time, and it is done synchronously.</li>
<li><strong>dump_gifs</strong> If set to True, GIF videos of the environment will be stored into the experiment directory according to
the filters defined in video_dump_methods.</li>
<li><strong>dump_mp4</strong> If set to True, MP4 videos of the environment will be stored into the experiment directory according to
the filters defined in video_dump_methods.</li>
<li><strong>dump_in_episode_signals</strong> If set to True, csv files will be dumped for each episode for inspecting different metrics within the
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>print_networks_summary</strong> If set to True, a summary of the structure of all the networks will be printed at the beginning of the experiment</p></li>
<li><p><strong>dump_csv</strong> If set to True, the logger will dump logs to a csv file once in every dump_signals_to_csv_every_x_episodes
episodes. The logs can be later used to visualize the training process using Coach Dashboard.</p></li>
<li><p><strong>dump_signals_to_csv_every_x_episodes</strong> Defines the number of episodes between writing new data to the csv log files. Lower values can affect
performance, as writing to disk may take time, and it is done synchronously.</p></li>
<li><p><strong>dump_gifs</strong> If set to True, GIF videos of the environment will be stored into the experiment directory according to
the filters defined in video_dump_methods.</p></li>
<li><p><strong>dump_mp4</strong> If set to True, MP4 videos of the environment will be stored into the experiment directory according to
the filters defined in video_dump_methods.</p></li>
<li><p><strong>dump_in_episode_signals</strong> If set to True, csv files will be dumped for each episode for inspecting different metrics within the
episode. This means that for each step in each episode, different metrics such as the reward, the
future return, etc. will be saved. Setting this to True may affect performance severely, and therefore
this should be used only for debugging purposes.</li>
<li><strong>dump_parameters_documentation</strong> If set to True, a json file containing all the agent parameters will be saved in the experiment directory.
this should be used only for debugging purposes.</p></li>
<li><p><strong>dump_parameters_documentation</strong> If set to True, a json file containing all the agent parameters will be saved in the experiment directory.
This may be very useful for inspecting the values defined for each parameters and making sure that all
the parameters are defined as expected.</li>
<li><strong>render</strong> If set to True, the environment render function will be called for each step, rendering the image of the
the parameters are defined as expected.</p></li>
<li><p><strong>render</strong> If set to True, the environment render function will be called for each step, rendering the image of the
environment. This may affect the performance of training, and is highly dependent on the environment.
By default, Coach uses PyGame to render the environment image instead of the environment-specific renderer.
To change this, use the native_rendering flag.</li>
<li><strong>native_rendering</strong> If set to True, the environment native renderer will be used for rendering the environment image.
To change this, use the native_rendering flag.</p></li>
<li><p><strong>native_rendering</strong> If set to True, the environment native renderer will be used for rendering the environment image.
In some cases this can be slower than rendering using PyGame through Coach, but in other cases the
environment opens its native renderer by default, so rendering with PyGame is an unnecessary overhead.</li>
<li><strong>max_fps_for_human_control</strong> The maximum number of frames per second used while playing the environment as a human. This only has
effect while using the play flag for Coach.</li>
<li><strong>tensorboard</strong> If set to True, TensorBoard summaries will be stored in the experiment directory. This can later be
loaded in TensorBoard in order to visualize the training process.</li>
<li><strong>video_dump_methods</strong> A list of dump methods that will be used as filters for deciding when to save videos.
environment opens its native renderer by default, so rendering with PyGame is an unnecessary overhead.</p></li>
<li><p><strong>max_fps_for_human_control</strong> The maximum number of frames per second used while playing the environment as a human. This only has
effect while using the play flag for Coach.</p></li>
<li><p><strong>tensorboard</strong> If set to True, TensorBoard summaries will be stored in the experiment directory. This can later be
loaded in TensorBoard in order to visualize the training process.</p></li>
<li><p><strong>video_dump_methods</strong> A list of dump methods that will be used as filters for deciding when to save videos.
The filters in the list will be checked one after the other until the first dump method that returns
false for should_dump() in the environment class. This list will only be used if dump_mp4 or dump_gif are
set to True.</li>
<li><strong>add_rendered_image_to_env_response</strong> Some environments have a different observation compared to the one displayed while rendering.
set to True.</p></li>
<li><p><strong>add_rendered_image_to_env_response</strong> Some environments have a different observation compared to the one displayed while rendering.
For some cases it can be useful to pass the rendered image to the agent for visualization purposes.
If this flag is set to True, the rendered image will be added to the environment EnvResponse object,
which will be passed to the agent and allow using those images.</li>
which will be passed to the agent and allow using those images.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
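For reference, a minimal sketch of how the VisualizationParameters documented above might be configured. The class path and keyword names come from the signature shown here; the specific values are illustrative assumptions.

```python
from rl_coach.base_parameters import VisualizationParameters

# Illustrative values: dump CSV logs every 10 episodes, record MP4 videos,
# and write TensorBoard summaries into the experiment directory.
vis_params = VisualizationParameters(
    dump_csv=True,
    dump_signals_to_csv_every_x_episodes=10,
    dump_mp4=True,
    tensorboard=True,
)
```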
@@ -246,29 +245,25 @@ which will be passed to the agent and allow using those images.</li>
<dl class="class">
<dt id="rl_coach.base_parameters.PresetValidationParameters">
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">PresetValidationParameters</code><span class="sig-paren">(</span><em>test=False</em>, <em>min_reward_threshold=0</em>, <em>max_episodes_to_achieve_reward=1</em>, <em>num_workers=1</em>, <em>reward_test_level=None</em>, <em>test_using_a_trace_test=True</em>, <em>trace_test_levels=None</em>, <em>trace_max_env_steps=5000</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/base_parameters.html#PresetValidationParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.PresetValidationParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>test</strong> A flag which specifies if the preset should be tested as part of the validation process.</li>
<li><strong>min_reward_threshold</strong> The minimum reward that the agent should pass after max_episodes_to_achieve_reward episodes when the
preset is run.</li>
<li><strong>max_episodes_to_achieve_reward</strong> The maximum number of episodes that the agent should train using the preset in order to achieve the
reward specified by min_reward_threshold.</li>
<li><strong>num_workers</strong> The number of workers that should be used when running this preset in the test suite for validation.</li>
<li><strong>reward_test_level</strong> The environment level or levels, given by a list of strings, that should be tested as part of the
reward tests suite.</li>
<li><strong>test_using_a_trace_test</strong> A flag that specifies if the preset should be run as part of the trace tests suite.</li>
<li><strong>trace_test_levels</strong> The environment level or levels, given by a list of strings, that should be tested as part of the
trace tests suite.</li>
<li><strong>trace_max_env_steps</strong> An integer representing the maximum number of environment steps to run when running this preset as part
of the trace tests suite.</li>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>test</strong> A flag which specifies if the preset should be tested as part of the validation process.</p></li>
<li><p><strong>min_reward_threshold</strong> The minimum reward that the agent should pass after max_episodes_to_achieve_reward episodes when the
preset is run.</p></li>
<li><p><strong>max_episodes_to_achieve_reward</strong> The maximum number of episodes that the agent should train using the preset in order to achieve the
reward specified by min_reward_threshold.</p></li>
<li><p><strong>num_workers</strong> The number of workers that should be used when running this preset in the test suite for validation.</p></li>
<li><p><strong>reward_test_level</strong> The environment level or levels, given by a list of strings, that should be tested as part of the
reward tests suite.</p></li>
<li><p><strong>test_using_a_trace_test</strong> A flag that specifies if the preset should be run as part of the trace tests suite.</p></li>
<li><p><strong>trace_test_levels</strong> The environment level or levels, given by a list of strings, that should be tested as part of the
trace tests suite.</p></li>
<li><p><strong>trace_max_env_steps</strong> An integer representing the maximum number of environment steps to run when running this preset as part
of the trace tests suite.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
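Similarly, a hedged sketch for PresetValidationParameters, reusing the keyword names from the signature above; the reward threshold and episode budget are made-up example values.

```python
from rl_coach.base_parameters import PresetValidationParameters

# Example values only: include this preset in the test suite and require an
# average reward of at least 150 within 250 training episodes.
preset_validation_params = PresetValidationParameters(
    test=True,
    min_reward_threshold=150,
    max_episodes_to_achieve_reward=250,
)
```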
@@ -277,30 +272,26 @@ of the trace tests suite.</li>
<dl class="class">
<dt id="rl_coach.base_parameters.TaskParameters">
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">TaskParameters</code><span class="sig-paren">(</span><em>framework_type: rl_coach.base_parameters.Frameworks = &lt;Frameworks.tensorflow: 'TensorFlow'&gt;</em>, <em>evaluate_only: int = None</em>, <em>use_cpu: bool = False</em>, <em>experiment_path='/tmp'</em>, <em>seed=None</em>, <em>checkpoint_save_secs=None</em>, <em>checkpoint_restore_dir=None</em>, <em>checkpoint_restore_path=None</em>, <em>checkpoint_save_dir=None</em>, <em>export_onnx_graph: bool = False</em>, <em>apply_stop_condition: bool = False</em>, <em>num_gpu: int = 1</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/base_parameters.html#TaskParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.TaskParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>framework_type</strong> deep learning framework type. currently only tensorflow is supported</li>
<li><strong>evaluate_only</strong> if not None, the task will be used only for evaluating the model for the given number of steps.
A value of 0 means that task will be evaluated for an infinite number of steps.</li>
<li><strong>use_cpu</strong> use the cpu for this task</li>
<li><strong>experiment_path</strong> the path to the directory which will store all the experiment outputs</li>
<li><strong>seed</strong> a seed to use for the random numbers generator</li>
<li><strong>checkpoint_save_secs</strong> the number of seconds between each checkpoint saving</li>
<li><strong>checkpoint_restore_dir</strong> [DEPRECATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
the dir to restore the checkpoints from</li>
<li><strong>checkpoint_restore_path</strong> the path to restore the checkpoints from</li>
<li><strong>checkpoint_save_dir</strong> the directory to store the checkpoints in</li>
<li><strong>export_onnx_graph</strong> If set to True, this will export an onnx graph each time a checkpoint is saved</li>
<li><strong>apply_stop_condition</strong> If set to True, this will apply the stop condition defined by reaching a target success rate</li>
<li><strong>num_gpu</strong> number of GPUs to use</li>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>framework_type</strong> deep learning framework type. currently only tensorflow is supported</p></li>
<li><p><strong>evaluate_only</strong> if not None, the task will be used only for evaluating the model for the given number of steps.
A value of 0 means that task will be evaluated for an infinite number of steps.</p></li>
<li><p><strong>use_cpu</strong> use the cpu for this task</p></li>
<li><p><strong>experiment_path</strong> the path to the directory which will store all the experiment outputs</p></li>
<li><p><strong>seed</strong> a seed to use for the random numbers generator</p></li>
<li><p><strong>checkpoint_save_secs</strong> the number of seconds between each checkpoint saving</p></li>
<li><p><strong>checkpoint_restore_dir</strong> [DEPRECATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
the dir to restore the checkpoints from</p></li>
<li><p><strong>checkpoint_restore_path</strong> the path to restore the checkpoints from</p></li>
<li><p><strong>checkpoint_save_dir</strong> the directory to store the checkpoints in</p></li>
<li><p><strong>export_onnx_graph</strong> If set to True, this will export an onnx graph each time a checkpoint is saved</p></li>
<li><p><strong>apply_stop_condition</strong> If set to True, this will apply the stop condition defined by reaching a target success rate</p></li>
<li><p><strong>num_gpu</strong> number of GPUs to use</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
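A sketch for TaskParameters as well; the Frameworks enum is assumed to be importable from rl_coach.base_parameters (as the signature's default value suggests), and the paths and intervals are placeholders.

```python
from rl_coach.base_parameters import Frameworks, TaskParameters

# Placeholder paths and intervals: run on CPU and checkpoint every 10 minutes.
task_params = TaskParameters(
    framework_type=Frameworks.tensorflow,
    use_cpu=True,
    experiment_path='./experiments/my_run',
    checkpoint_save_secs=600,
    checkpoint_save_dir='./experiments/my_run/checkpoints',
)
```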
@@ -309,35 +300,31 @@ the dir to restore the checkpoints from</li>
<dl class="class">
<dt id="rl_coach.base_parameters.DistributedTaskParameters">
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">DistributedTaskParameters</code><span class="sig-paren">(</span><em>framework_type: rl_coach.base_parameters.Frameworks</em>, <em>parameters_server_hosts: str</em>, <em>worker_hosts: str</em>, <em>job_type: str</em>, <em>task_index: int</em>, <em>evaluate_only: int = None</em>, <em>num_tasks: int = None</em>, <em>num_training_tasks: int = None</em>, <em>use_cpu: bool = False</em>, <em>experiment_path=None</em>, <em>dnd=None</em>, <em>shared_memory_scratchpad=None</em>, <em>seed=None</em>, <em>checkpoint_save_secs=None</em>, <em>checkpoint_restore_path=None</em>, <em>checkpoint_save_dir=None</em>, <em>export_onnx_graph: bool = False</em>, <em>apply_stop_condition: bool = False</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/base_parameters.html#DistributedTaskParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.DistributedTaskParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>framework_type</strong> deep learning framework type. currently only tensorflow is supported</li>
<li><strong>evaluate_only</strong> if not None, the task will be used only for evaluating the model for the given number of steps.
A value of 0 means that task will be evaluated for an infinite number of steps.</li>
<li><strong>parameters_server_hosts</strong> comma-separated list of hostname:port pairs to which the parameter servers are
assigned</li>
<li><strong>worker_hosts</strong> comma-separated list of hostname:port pairs to which the workers are assigned</li>
<li><strong>job_type</strong> the job type - either ps (short for parameters server) or worker</li>
<li><strong>task_index</strong> the index of the process</li>
<li><strong>num_tasks</strong> the number of total tasks that are running (not including the parameters server)</li>
<li><strong>num_training_tasks</strong> the number of tasks that are training (not including the parameters server)</li>
<li><strong>use_cpu</strong> use the cpu for this task</li>
<li><strong>experiment_path</strong> the path to the directory which will store all the experiment outputs</li>
<li><strong>dnd</strong> an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad.</li>
<li><strong>seed</strong> a seed to use for the random numbers generator</li>
<li><strong>checkpoint_save_secs</strong> the number of seconds between each checkpoint saving</li>
<li><strong>checkpoint_restore_path</strong> the path to restore the checkpoints from</li>
<li><strong>checkpoint_save_dir</strong> the directory to store the checkpoints in</li>
<li><strong>export_onnx_graph</strong> If set to True, this will export an onnx graph each time a checkpoint is saved</li>
<li><strong>apply_stop_condition</strong> If set to True, this will apply the stop condition defined by reaching a target success rate</li>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>framework_type</strong> deep learning framework type. currently only tensorflow is supported</p></li>
<li><p><strong>evaluate_only</strong> if not None, the task will be used only for evaluating the model for the given number of steps.
A value of 0 means that task will be evaluated for an infinite number of steps.</p></li>
<li><p><strong>parameters_server_hosts</strong> comma-separated list of hostname:port pairs to which the parameter servers are
assigned</p></li>
<li><p><strong>worker_hosts</strong> comma-separated list of hostname:port pairs to which the workers are assigned</p></li>
<li><p><strong>job_type</strong> the job type - either ps (short for parameters server) or worker</p></li>
<li><p><strong>task_index</strong> the index of the process</p></li>
<li><p><strong>num_tasks</strong> the number of total tasks that are running (not including the parameters server)</p></li>
<li><p><strong>num_training_tasks</strong> the number of tasks that are training (not including the parameters server)</p></li>
<li><p><strong>use_cpu</strong> use the cpu for this task</p></li>
<li><p><strong>experiment_path</strong> the path to the directory which will store all the experiment outputs</p></li>
<li><p><strong>dnd</strong> an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad.</p></li>
<li><p><strong>seed</strong> a seed to use for the random numbers generator</p></li>
<li><p><strong>checkpoint_save_secs</strong> the number of seconds between each checkpoint saving</p></li>
<li><p><strong>checkpoint_restore_path</strong> the path to restore the checkpoints from</p></li>
<li><p><strong>checkpoint_save_dir</strong> the directory to store the checkpoints in</p></li>
<li><p><strong>export_onnx_graph</strong> If set to True, this will export an onnx graph each time a checkpoint is saved</p></li>
<li><p><strong>apply_stop_condition</strong> If set to True, this will apply the stop condition defined by reaching a target success rate</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
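For the distributed variant, a hypothetical single-machine layout with one parameter server and two workers; the host:port strings and task index are placeholders, while the keyword names mirror the documented signature.

```python
from rl_coach.base_parameters import DistributedTaskParameters, Frameworks

# Hypothetical layout: one parameter server and two workers on localhost.
distributed_params = DistributedTaskParameters(
    framework_type=Frameworks.tensorflow,
    parameters_server_hosts='localhost:2222',
    worker_hosts='localhost:2223,localhost:2224',
    job_type='worker',   # 'ps' (parameters server) or 'worker', per the docs above
    task_index=0,
    num_tasks=2,
    num_training_tasks=2,
)
```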
@@ -352,7 +339,7 @@ assigned</li>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="spaces.html" class="btn btn-neutral" title="Spaces" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="spaces.html" class="btn btn-neutral float-left" title="Spaces" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -361,7 +348,7 @@ assigned</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -378,27 +365,16 @@ assigned</li>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script type="text/javascript" src="../_static/jquery.js"></script>
<script type="text/javascript" src="../_static/underscore.js"></script>
<script type="text/javascript" src="../_static/doctools.js"></script>
<script type="text/javascript" src="../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>


@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Behavioral Cloning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Behavioral Cloning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="ACER" href="../policy_optimization/acer.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -230,9 +233,9 @@ These demonstrations are given as state, action tuples, and with no reward.
The training goal is to reduce the difference between the actions predicted by the network and the actions taken by
the expert for each state.</p>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Use the current states as input to the network, and the expert actions as the targets of the network.</li>
<li>For the network head, we use the policy head, which uses the cross entropy loss function.</li>
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>Use the current states as input to the network, and the expert actions as the targets of the network.</p></li>
<li><p>For the network head, we use the policy head, which uses the cross entropy loss function.</p></li>
</ol>
<dl class="class">
<dt id="rl_coach.agents.bc_agent.BCAlgorithmParameters">
@@ -254,7 +257,7 @@ the expert for each state.</p>
<a href="../value_optimization/bs_dqn.html" class="btn btn-neutral float-right" title="Bootstrapped DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../policy_optimization/acer.html" class="btn btn-neutral" title="ACER" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../policy_optimization/acer.html" class="btn btn-neutral float-left" title="ACER" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -263,7 +266,7 @@ the expert for each state.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -280,27 +283,16 @@ the expert for each state.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>
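The behavioral cloning page diffed above reduces training to a supervised step: expert actions become classification targets for the policy head under a cross-entropy loss. A small numpy sketch of that loss, with assumed array shapes and not taken from Coach's own code:

```python
import numpy as np

def bc_cross_entropy_loss(action_probs: np.ndarray, expert_actions: np.ndarray) -> float:
    """action_probs: (batch, num_actions) policy-head output; expert_actions: (batch,) expert action indices."""
    batch_indices = np.arange(len(expert_actions))
    # Probability the policy assigns to the action the expert actually took.
    picked = np.clip(action_probs[batch_indices, expert_actions], 1e-12, 1.0)
    return float(-np.mean(np.log(picked)))
```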


@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Conditional Imitation Learning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Conditional Imitation Learning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Categorical DQN" href="../value_optimization/categorical_dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -233,25 +236,22 @@ the expert for each state.
In conditional imitation learning, each transition is assigned a class, which determines the goal that was pursued
in that transition. For example, 3 possible classes can be: turn right, turn left and follow lane.</p>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer, where the batch is balanced, meaning that an equal number
of transitions will be sampled from each class index.</li>
<li>Use the current states as input to the network, and assign the expert actions as the targets of the network heads
<li><p>Sample a batch of transitions from the replay buffer, where the batch is balanced, meaning that an equal number
of transitions will be sampled from each class index.</p></li>
<li><p>Use the current states as input to the network, and assign the expert actions as the targets of the network heads
corresponding to the state classes. For the other heads, set the targets to match the currently predicted values,
so that the loss for the other heads will be zeroed out.</li>
<li>We use a regression head, that minimizes the MSE loss between the network predicted values and the target values.</li>
so that the loss for the other heads will be zeroed out.</p></li>
<li><p>We use a regression head, that minimizes the MSE loss between the network predicted values and the target values.</p></li>
</ol>
<dl class="class">
<dt id="rl_coach.agents.cil_agent.CILAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.cil_agent.</code><code class="descname">CILAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/cil_agent.html#CILAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.cil_agent.CILAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state_key_with_the_class_index</strong> (str)
The key of the state dictionary which corresponds to the value that will be used to control the class index.</td>
</tr>
</tbody>
</table>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>state_key_with_the_class_index</strong> (str)
The key of the state dictionary which corresponds to the value that will be used to control the class index.</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -269,7 +269,7 @@ The key of the state dictionary which corresponds to the value that will be used
<a href="../policy_optimization/cppo.html" class="btn btn-neutral float-right" title="Clipped Proximal Policy Optimization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../value_optimization/categorical_dqn.html" class="btn btn-neutral" title="Categorical DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../value_optimization/categorical_dqn.html" class="btn btn-neutral float-left" title="Categorical DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -278,7 +278,7 @@ The key of the state dictionary which corresponds to the value that will be used
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -295,27 +295,16 @@ The key of the state dictionary which corresponds to the value that will be used
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>
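The conditional imitation learning page diffed above describes a per-head masking trick: only the head matching a sample's class index receives the expert action as its regression target, while the other heads are fed their own current predictions so their MSE contribution vanishes. A small sketch of that target construction, with assumed array shapes:

```python
import numpy as np

def build_cil_targets(current_predictions: np.ndarray,
                      expert_actions: np.ndarray,
                      class_indices: np.ndarray) -> np.ndarray:
    """current_predictions: (batch, num_heads, action_dim); expert_actions: (batch, action_dim);
    class_indices: (batch,) class per transition, e.g. turn left / turn right / follow lane."""
    targets = current_predictions.copy()
    # Only the head selected by each sample's class gets the expert action as target;
    # all other heads keep their current predictions, so their (prediction - target) error is zero.
    targets[np.arange(len(class_indices)), class_indices] = expert_actions
    return targets
```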


@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Agents &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Agents &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Adding a New Environment" href="../../contributing/add_env.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -241,59 +244,50 @@ A detailed description of those algorithms can be found by navigating to each of
<dl class="class">
<dt id="rl_coach.base_parameters.AgentParameters">
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">AgentParameters</code><span class="sig-paren">(</span><em>algorithm: rl_coach.base_parameters.AlgorithmParameters, exploration: ExplorationParameters, memory: MemoryParameters, networks: Dict[str, rl_coach.base_parameters.NetworkParameters], visualization: rl_coach.base_parameters.VisualizationParameters = &lt;rl_coach.base_parameters.VisualizationParameters object&gt;</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/base_parameters.html#AgentParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.AgentParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>algorithm</strong> A class inheriting AlgorithmParameters.
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>algorithm</strong> A class inheriting AlgorithmParameters.
The parameters used for the specific algorithm used by the agent.
These parameters can be later referenced in the agent implementation through self.ap.algorithm.</li>
<li><strong>exploration</strong> Either a class inheriting ExplorationParameters or a dictionary mapping between action
These parameters can be later referenced in the agent implementation through self.ap.algorithm.</p></li>
<li><p><strong>exploration</strong> Either a class inheriting ExplorationParameters or a dictionary mapping between action
space types and their corresponding ExplorationParameters. If a dictionary was used,
when the agent will be instantiated, the correct exploration policy parameters will be used
according to the real type of the environment action space.
These parameters will be used to instantiate the exploration policy.</li>
<li><strong>memory</strong> A class inheriting MemoryParameters. It defines all the parameters used by the memory module.</li>
<li><strong>networks</strong> A dictionary mapping between network names and their corresponding network parameters, defined
These parameters will be used to instantiate the exploration policy.</p></li>
<li><p><strong>memory</strong> A class inheriting MemoryParameters. It defines all the parameters used by the memory module.</p></li>
<li><p><strong>networks</strong> A dictionary mapping between network names and their corresponding network parameters, defined
as a class inheriting NetworkParameters. Each element will be used in order to instantiate
a NetworkWrapper class, and all the network wrappers will be stored in the agent under
self.network_wrappers. self.network_wrappers is a dict mapping between the network name that
was given in the networks dict, and the instantiated network wrapper.</li>
<li><strong>visualization</strong> A class inheriting VisualizationParameters and defining various parameters that can be
used for visualization purposes, such as printing to the screen, rendering, and saving videos.</li>
was given in the networks dict, and the instantiated network wrapper.</p></li>
<li><p><strong>visualization</strong> A class inheriting VisualizationParameters and defining various parameters that can be
used for visualization purposes, such as printing to the screen, rendering, and saving videos.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="rl_coach.agents.agent.Agent">
<em class="property">class </em><code class="descclassname">rl_coach.agents.agent.</code><code class="descname">Agent</code><span class="sig-paren">(</span><em>agent_parameters: rl_coach.base_parameters.AgentParameters</em>, <em>parent: Union[LevelManager</em>, <em>CompositeAgent] = None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>agent_parameters</strong> A AgentParameters class instance with all the agent parameters</td>
</tr>
</tbody>
</table>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>agent_parameters</strong> A AgentParameters class instance with all the agent parameters</p>
</dd>
</dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.act">
<code class="descname">act</code><span class="sig-paren">(</span><em>action: Union[None</em>, <em>int</em>, <em>float</em>, <em>numpy.ndarray</em>, <em>List] = None</em><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.ActionInfo<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.act"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.act" title="Permalink to this definition"></a></dt>
<dd><p>Given the agents current knowledge, decide on the next action to apply to the environment</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> An action to take, overriding whatever the current policy is</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">An ActionInfo object, which contains the action and any additional info from the action decision process</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action</strong> An action to take, overriding whatever the current policy is</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>An ActionInfo object, which contains the action and any additional info from the action decision process</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -302,21 +296,17 @@ used for visualization purposes, such as printing to the screen, rendering, and
<dd><p>This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>func</strong> the name of the memory function to call</li>
<li><strong>args</strong> the arguments to supply to the function</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>func</strong> the name of the memory function to call</p></li>
<li><p><strong>args</strong> the arguments to supply to the function</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the return value of the function</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the return value of the function</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -324,16 +314,14 @@ both with a shared and a local memory.</p>
<code class="descname">choose_action</code><span class="sig-paren">(</span><em>curr_state</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.choose_action"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.choose_action" title="Permalink to this definition"></a></dt>
<dd><p>choose an action to act with in the current episode being played. Different behavior might be exhibited when
training or testing.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>curr_state</strong> the current state to act upon.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">chosen action, some action value describing the action (q-value, probability, etc)</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>curr_state</strong> the current state to act upon.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>chosen action, some action value describing the action (q-value, probability, etc)</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -351,14 +339,11 @@ training or testing.</p>
<dd><p>Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A list containing all the networks</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>A list containing all the networks</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -367,37 +352,31 @@ for creating the network.</p>
<dd><p>Get a prediction from the agent with regard to the requested prediction_type.
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
raise a ValueException.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>states</strong> The states to get a prediction for</li>
<li><strong>prediction_type</strong> The type of prediction to get for the states. For example, the state-value prediction.</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>states</strong> The states to get a prediction for</p></li>
<li><p><strong>prediction_type</strong> The type of prediction to get for the states. For example, the state-value prediction.</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the predicted values</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the predicted values</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.get_state_embedding">
<code class="descname">get_state_embedding</code><span class="sig-paren">(</span><em>state: dict</em><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.get_state_embedding"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.get_state_embedding" title="Permalink to this definition"></a></dt>
<dd><p>Given a state, get the corresponding state embedding from the main network</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state</strong> a state dict</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy embedding vector</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>state</strong> a state dict</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a numpy embedding vector</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -406,14 +385,11 @@ raise a ValueException.</p>
<dd><p>Make any changes needed when each episode is ended.
This includes incrementing counters, updating full episode dependent values, updating logs, etc.
This function is called right after each episode is ended.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -421,44 +397,36 @@ This function is called right after each episode is ended.</p>
<code class="descname">init_environment_dependent_modules</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.init_environment_dependent_modules"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.init_environment_dependent_modules" title="Permalink to this definition"></a></dt>
<dd><p>Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.learn_from_batch">
<code class="descname">learn_from_batch</code><span class="sig-paren">(</span><em>batch</em><span class="sig-paren">)</span> &#x2192; Tuple[float, List, List]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.learn_from_batch"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.learn_from_batch" title="Permalink to this definition"></a></dt>
<dd><p>Given a batch of transitions, calculates their target values and updates the network.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>batch</strong> A list of transitions</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">The total loss of the training, the loss per head and the unclipped gradients</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>batch</strong> A list of transitions</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The total loss of the training, the loss per head and the unclipped gradients</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.log_to_screen">
<code class="descname">log_to_screen</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.log_to_screen"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.log_to_screen" title="Permalink to this definition"></a></dt>
<dd><p>Write an episode summary line to the terminal</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -467,59 +435,48 @@ the observation space</p>
<dd><p>Given a response from the environment, distill the observation from it and store it for later use.
The response should be a dictionary containing the performed action, the new observation and measurements,
the reward, a game over flag and any additional information necessary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>env_response</strong> result of call from environment.step(action)</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a boolean value which determines if the agent has decided to terminate the episode after seeing the
given observation</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>env_response</strong> result of call from environment.step(action)</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a boolean value which determines if the agent has decided to terminate the episode after seeing the
given observation</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
<dt id="rl_coach.agents.agent.Agent.parent">
<code class="descname">parent</code><a class="headerlink" href="#rl_coach.agents.agent.Agent.parent" title="Permalink to this definition"></a></dt>
<dd><p>Get the parent class of the agent</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the current phase</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>the current phase</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
<dt id="rl_coach.agents.agent.Agent.phase">
<code class="descname">phase</code><a class="headerlink" href="#rl_coach.agents.agent.Agent.phase" title="Permalink to this definition"></a></dt>
<dd><p>The current running phase of the agent</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">RunPhase</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>RunPhase</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.post_training_commands">
<code class="descname">post_training_commands</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.post_training_commands"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.post_training_commands" title="Permalink to this definition"></a></dt>
<dd><p>A function which allows adding any functionality that is required to run right after the training phase ends.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -527,45 +484,37 @@ given observation</td>
<code class="descname">prepare_batch_for_inference</code><span class="sig-paren">(</span><em>states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str</em><span class="sig-paren">)</span> &#x2192; Dict[str, numpy.array]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.prepare_batch_for_inference"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.prepare_batch_for_inference" title="Permalink to this definition"></a></dt>
<dd><p>Convert curr_state into the input tensors that TensorFlow expects, i.e. if there are several input states, stack all
observations together, measurements together, etc.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>states</strong> A list of environment states, where each one is a dict mapping from an observation name to its
corresponding observation</li>
<li><strong>network_name</strong> The agent network name to prepare the batch for. this is needed in order to extract only
the observation relevant for the network from the states.</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>states</strong> A list of environment states, where each one is a dict mapping from an observation name to its
corresponding observation</p></li>
<li><p><strong>network_name</strong> The agent network name to prepare the batch for. this is needed in order to extract only
the observation relevant for the network from the states.</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A dictionary containing a list of values from all the given states for each of the observations</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A dictionary containing a list of values from all the given states for each of the observations</p>
</dd>
</dl>
</dd></dl>
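<p>A minimal sketch of the stacking described above (not Coach's actual implementation), assuming each state is a dict of numpy arrays and that the network's input names match the observation names:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def prepare_batch_for_inference_sketch(states, observation_names):
    """Stack the per-state arrays of each observation into one batch tensor per input."""
    return {name: np.stack([state[name] for state in states]) for name in observation_names}

# usage: two states, each with an image observation and a measurements vector
states = [
    {'observation': np.zeros((84, 84, 3)), 'measurements': np.array([0.0, 1.0])},
    {'observation': np.ones((84, 84, 3)),  'measurements': np.array([0.5, 0.2])},
]
batch = prepare_batch_for_inference_sketch(states, ['observation', 'measurements'])
print(batch['observation'].shape)  # (2, 84, 84, 3)
</pre></div></div>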
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.register_signal">
<code class="descname">register_signal</code><span class="sig-paren">(</span><em>signal_name: str</em>, <em>dump_one_value_per_episode: bool = True</em>, <em>dump_one_value_per_step: bool = False</em><span class="sig-paren">)</span> &#x2192; rl_coach.utils.Signal<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.register_signal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.register_signal" title="Permalink to this definition"></a></dt>
<dd><p>Register a signal such that its statistics will be dumped and will be viewable through the dashboard</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>signal_name</strong> the name of the signal as it will appear in dashboard</li>
<li><strong>dump_one_value_per_episode</strong> should the signal value be written for each episode?</li>
<li><strong>dump_one_value_per_step</strong> should the signal value be written for each step?</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>signal_name</strong> the name of the signal as it will appear in dashboard</p></li>
<li><p><strong>dump_one_value_per_episode</strong> should the signal value be written for each episode?</p></li>
<li><p><strong>dump_one_value_per_step</strong> should the signal value be written for each step?</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the created signal</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the created signal</p>
</dd>
</dl>
</dd></dl>
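<p>A sketch of how this might be used inside a custom agent (the base agent, the signal name, the <code class="docutils literal notranslate"><span class="pre">add_sample</span></code> call and the learn_from_batch return signature are assumptions for illustration, not a prescribed recipe):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
from rl_coach.agents.dqn_agent import DQNAgent  # illustrative choice of base agent

class MyAgent(DQNAgent):
    def __init__(self, agent_parameters, parent=None):
        super().__init__(agent_parameters, parent)
        # the signal will be dumped to the experiment csv files and shown in Dashboard
        self.loss_signal = self.register_signal('My Loss', dump_one_value_per_episode=True)

    def learn_from_batch(self, batch):
        total_loss, losses, unclipped_grads = super().learn_from_batch(batch)
        # record a value for the registered signal
        self.loss_signal.add_sample(float(total_loss))
        return total_loss, losses, unclipped_grads
</pre></div></div>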
<dl class="method">
@@ -574,46 +523,39 @@ the observation relevant for the network from the states.</li>
<dd><p>Perform accumulator initialization when entering an evaluation phase, and signal dumping when exiting an
evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
by val, and by the current phase set in self.phase.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> The new phase to change to</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>val</strong> The new phase to change to</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.reset_internal_state">
<code class="descname">reset_internal_state</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.reset_internal_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.reset_internal_state" title="Permalink to this definition"></a></dt>
<dd><p>Reset all the episodic parameters. This function is called right before each episode starts.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.restore_checkpoint">
<code class="descname">restore_checkpoint</code><span class="sig-paren">(</span><em>checkpoint_dir: str</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.restore_checkpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.restore_checkpoint" title="Permalink to this definition"></a></dt>
<dd><p>Allows agents to restore additional information that was stored when the checkpoint was saved.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>checkpoint_dir</strong> The checkpoint dir to restore from</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>checkpoint_dir</strong> The checkpoint dir to restore from</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -621,51 +563,42 @@ by val, and by the current phase set in self.phase.</p>
<code class="descname">run_off_policy_evaluation</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="headerlink" href="#rl_coach.agents.agent.Agent.run_off_policy_evaluation" title="Permalink to this definition"></a></dt>
<dd><p>Run off-policy evaluation estimators to evaluate the trained policy's performance against a dataset.
Should only be implemented for off-policy RL algorithms.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.run_pre_network_filter_for_inference">
<code class="descname">run_pre_network_filter_for_inference</code><span class="sig-paren">(</span><em>state: Dict[str, numpy.ndarray], update_filter_internal_state: bool = True</em><span class="sig-paren">)</span> &#x2192; Dict[str, numpy.ndarray]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.run_pre_network_filter_for_inference"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.run_pre_network_filter_for_inference" title="Permalink to this definition"></a></dt>
<dd><p>Run the filters that were defined to be applied right before the state is used for inference.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>state</strong> The state to run the filters on</li>
<li><strong>update_filter_internal_state</strong> Should update the filter's internal state - should not update when evaluating</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>state</strong> The state to run the filters on</p></li>
<li><p><strong>update_filter_internal_state</strong> Should update the filter's internal state - should not update when evaluating</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">The filtered state</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The filtered state</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.save_checkpoint">
<code class="descname">save_checkpoint</code><span class="sig-paren">(</span><em>checkpoint_prefix: str</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.save_checkpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.save_checkpoint" title="Permalink to this definition"></a></dt>
<dd><p>Allows agents to store additional information when saving checkpoints.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>checkpoint_prefix</strong> The prefix of the checkpoint file to save</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>checkpoint_prefix</strong> The prefix of the checkpoint file to save</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -673,16 +606,14 @@ Should only be implemented for off-policy RL algorithms.</p>
<code class="descname">set_environment_parameters</code><span class="sig-paren">(</span><em>spaces: rl_coach.spaces.SpacesDefinition</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.set_environment_parameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.set_environment_parameters" title="Permalink to this definition"></a></dt>
<dd><p>Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>spaces</strong> the environment spaces definition</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>spaces</strong> the environment spaces definition</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -692,58 +623,47 @@ dependent on those values, by calling init_environment_dependent_modules</p>
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
in-action-space.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> The action that should be set as the directive</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"></td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action</strong> The action that should be set as the directive</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p></p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.set_session">
<code class="descname">set_session</code><span class="sig-paren">(</span><em>sess</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.set_session"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.set_session" title="Permalink to this definition"></a></dt>
<dd><p>Set the deep learning framework session for all the agents in the composite agent</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.setup_logger">
<code class="descname">setup_logger</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.setup_logger"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.setup_logger" title="Permalink to this definition"></a></dt>
<dd><p>Setup the logger for the agent</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.sync">
<code class="descname">sync</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.sync"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.sync" title="Permalink to this definition"></a></dt>
<dd><p>Sync the global network parameters to local networks</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -752,14 +672,11 @@ in-action-space.</p>
<dd><p>Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The total training loss during the training iterations.</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The total training loss during the training iterations.</p>
</dd>
</dl>
</dd></dl>
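<p>The flow described above can be sketched roughly as follows (simplified Python pseudocode, not Coach's actual implementation):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
def train_sketch(memory, learn_from_batch, update_target_network, batch_size,
                 steps_since_last_training, num_consecutive_playing_steps,
                 num_consecutive_training_steps):
    """Rough sketch of the training flow described above."""
    total_loss = 0.0
    # train only once enough new environment steps have been played
    if steps_since_last_training >= num_consecutive_playing_steps:
        for _ in range(num_consecutive_training_steps):
            batch = memory.sample(batch_size)       # sample a batch from the replay buffer
            total_loss += learn_from_batch(batch)   # train on it
            update_target_network()                 # and update the target network(s)
    return total_loss
</pre></div></div>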
<dl class="method">
@@ -768,28 +685,22 @@ A single training iteration: Sample a batch, train on it and update target netwo
<dd><p>Updates the episodic log file with all the signal values from the most recent episode.
Additional signals for logging can be added by creating a new signal using self.register_signal,
and then updating it with some internal agent values.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.update_step_in_episode_log">
<code class="descname">update_step_in_episode_log</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.update_step_in_episode_log"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.update_step_in_episode_log" title="Permalink to this definition"></a></dt>
<dd><p>Updates the in-episode log file with all the signal values from the most recent step.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -797,16 +708,14 @@ and then updating it with some internal agent values.</p>
<code class="descname">update_transition_before_adding_to_replay_buffer</code><span class="sig-paren">(</span><em>transition: rl_coach.core_types.Transition</em><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.Transition<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.update_transition_before_adding_to_replay_buffer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.update_transition_before_adding_to_replay_buffer" title="Permalink to this definition"></a></dt>
<dd><p>Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transition</strong> the transition to update</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the updated transition</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>transition</strong> the transition to update</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the updated transition</p>
</dd>
</dl>
</dd></dl>
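<p>For example, an agent that clips rewards before they reach the replay buffer could override this hook roughly as follows (the base agent and the clipping range are illustrative assumptions):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgent   # illustrative choice of base agent
from rl_coach.core_types import Transition

class RewardClippingAgent(DQNAgent):
    def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
        # tweak the reward just before the transition is stored
        transition.reward = float(np.clip(transition.reward, -1.0, 1.0))
        return transition
</pre></div></div>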
</dd></dl>
@@ -824,7 +733,7 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
<a href="policy_optimization/ac.html" class="btn btn-neutral float-right" title="Actor-Critic" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../../contributing/add_env.html" class="btn btn-neutral" title="Adding a New Environment" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../../contributing/add_env.html" class="btn btn-neutral float-left" title="Adding a New Environment" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -833,7 +742,7 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -850,27 +759,16 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Direct Future Prediction &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Direct Future Prediction &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Soft Actor-Critic" href="../policy_optimization/sac.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -228,13 +231,13 @@
<div class="section" id="choosing-an-action">
<h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>The current states (observations and measurements) and the corresponding goal vector are passed as an input to the network.
<li><p>The current states (observations and measurements) and the corresponding goal vector are passed as an input to the network.
The output of the network is the predicted future measurements for time-steps <span class="math notranslate nohighlight">\(t+1,t+2,t+4,t+8,t+16\)</span> and
<span class="math notranslate nohighlight">\(t+32\)</span> for each possible action.</li>
<li>For each action, the measurements of each predicted time-step are multiplied by the goal vector,
and the result is a single vector of future values for each action.</li>
<li>Then, a weighted sum of the future values of each action is calculated, and the result is a single value for each action.</li>
<li>The action values are passed to the exploration policy to decide on the action to use.</li>
<span class="math notranslate nohighlight">\(t+32\)</span> for each possible action.</p></li>
<li><p>For each action, the measurements of each predicted time-step are multiplied by the goal vector,
and the result is a single vector of future values for each action.</p></li>
<li><p>Then, a weighted sum of the future values of each action is calculated, and the result is a single value for each action.</p></li>
<li><p>The action values are passed to the exploration policy to decide on the action to use.</p></li>
</ol>
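<p>The computation in the steps above can be sketched with plain numpy as follows (the shapes, goal vector and weights are illustrative only):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

num_actions, num_timesteps, num_measurements = 3, 6, 2

# 1. network output: predicted future measurements per action and per predicted time-step
predicted_measurements = np.random.rand(num_actions, num_timesteps, num_measurements)

# 2. weight each measurement by the goal vector -> future values per action and time-step
goal_vector = np.array([1.0, -0.5])                   # maximize the first, minimize the second
future_values = predicted_measurements @ goal_vector  # shape: (num_actions, num_timesteps)

# 3. weighted sum over the predicted time-steps -> a single value per action
future_measurements_weights = np.array([0.0, 0.0, 0.0, 0.5, 0.5, 1.0])
action_values = future_values @ future_measurements_weights  # shape: (num_actions,)

# 4. the action values are handed to the exploration policy (greedy here, for simplicity)
action = int(np.argmax(action_values))
</pre></div></div>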
</div>
<div class="section" id="training-the-network">
@@ -247,39 +250,35 @@ For the actions that were not taken, the targets are the current values.</p>
<dl class="class">
<dt id="rl_coach.agents.dfp_agent.DFPAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.dfp_agent.</code><code class="descname">DFPAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/dfp_agent.html#DFPAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.dfp_agent.DFPAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_predicted_steps_ahead</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_predicted_steps_ahead</strong> (int)
Number of future steps to predict measurements for. The future steps won't be sequential, but rather jump
in multiples of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4.
The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]</li>
<li><strong>goal_vector</strong> (List[float])
The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]</p></li>
<li><p><strong>goal_vector</strong> (List[float])
The goal vector will weight each of the measurements to form an optimization goal. The vector should have
the same length as the number of measurements, and it will be vector multiplied by the measurements.
Positive values correspond to trying to maximize the particular measurement, and negative values
correspond to trying to minimize the particular measurement.</li>
<li><strong>future_measurements_weights</strong> (List[float])
correspond to trying to minimize the particular measurement.</p></li>
<li><p><strong>future_measurements_weights</strong> (List[float])
The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
then only the 3 last timesteps will be taken into account, according to the weights in the
future_measurements_weights vector.</li>
<li><strong>use_accumulated_reward_as_measurement</strong> (bool)
future_measurements_weights vector.</p></li>
<li><p><strong>use_accumulated_reward_as_measurement</strong> (bool)
If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
the measurements vector in the state. This can be useful in environments where the given measurements don't
include enough information for the particular goal the agent should achieve.</li>
<li><strong>handling_targets_after_episode_end</strong> (HandlingTargetsAfterEpisodeEnd)
Dictates how to handle measurements that are outside the episode length.</li>
<li><strong>scale_measurements_targets</strong> (Dict[str, float])
include enough information for the particular goal the agent should achieve.</p></li>
<li><p><strong>handling_targets_after_episode_end</strong> (HandlingTargetsAfterEpisodeEnd)
Dictates how to handle measurements that are outside the episode length.</p></li>
<li><p><strong>scale_measurements_targets</strong> (Dict[str, float])
Allows rescaling the values of each of the measurements available. This can be useful when the measurements
have a different scale and you want to normalize them to the same scale.</li>
have a different scale and you want to normalize them to the same scale.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
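<p>For instance, the predicted time-steps described for num_predicted_steps_ahead follow directly from the formula quoted in the parameter description:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
t = 0
num_predicted_steps_ahead = 3
predicted_steps = [t + 2 ** i for i in range(num_predicted_steps_ahead)]
print(predicted_steps)  # [1, 2, 4], i.e. t+1, t+2, t+4
</pre></div></div>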
</div>
@@ -297,7 +296,7 @@ have a different scale and you want to normalize them to the same scale.</li>
<a href="../value_optimization/double_dqn.html" class="btn btn-neutral float-right" title="Double DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../policy_optimization/sac.html" class="btn btn-neutral" title="Soft Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../policy_optimization/sac.html" class="btn btn-neutral float-left" title="Soft Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -306,7 +305,7 @@ have a different scale and you want to normalize them to the same scale.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -323,27 +322,16 @@ have a different scale and you want to normalize them to the same scale.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Actor-Critic &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Actor-Critic &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Agents" href="../index.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -235,41 +238,37 @@ distribution assigned with these probabilities. When testing, the action with th
<p>A batch of <span class="math notranslate nohighlight">\(T_{max}\)</span> transitions is used, and the advantages are calculated upon it.</p>
<p>Advantages can be calculated by either of the following methods (configured by the selected preset) -</p>
<ol class="arabic simple">
<li><strong>A_VALUE</strong> - Estimating advantage directly:
<li><p><strong>A_VALUE</strong> - Estimating advantage directly:
<span class="math notranslate nohighlight">\(A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t)\)</span>
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch.</li>
<li><strong>GAE</strong> - By following the <a class="reference external" href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper.</li>
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch.</p></li>
<li><p><strong>GAE</strong> - By following the <a class="reference external" href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper.</p></li>
</ol>
<p>The advantages are then used in order to accumulate gradients according to
<span class="math notranslate nohighlight">\(L = -\mathop{\mathbb{E}} [log (\pi) \cdot A]\)</span></p>
<dl class="class">
<dt id="rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.actor_critic_agent.</code><code class="descname">ActorCriticAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/actor_critic_agent.html#ActorCriticAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
The value that will be used to rescale the policy gradient</li>
<li><strong>apply_gradients_every_x_episodes</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
The value that will be used to rescale the policy gradient</p></li>
<li><p><strong>apply_gradients_every_x_episodes</strong> (int)
The number of episodes to wait before applying the accumulated gradients to the network.
The training iterations only accumulate gradients without actually applying them.</li>
<li><strong>beta_entropy</strong> (float)
The weight that will be given to the entropy regularization which is used in order to improve exploration.</li>
<li><strong>num_steps_between_gradient_updates</strong> (int)
The training iterations only accumulate gradients without actually applying them.</p></li>
<li><p><strong>beta_entropy</strong> (float)
The weight that will be given to the entropy regularization which is used in order to improve exploration.</p></li>
<li><p><strong>num_steps_between_gradient_updates</strong> (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</li>
<li><strong>gae_lambda</strong> (float)
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</p></li>
<li><p><strong>gae_lambda</strong> (float)
If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage estimation
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.</li>
<li><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value targets for the V head will be estimated using the GAE scheme.</li>
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.</p></li>
<li><p><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value targets for the V head will be estimated using the GAE scheme.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -287,7 +286,7 @@ If set to True, the state value targets for the V head will be estimated using t
<a href="acer.html" class="btn btn-neutral float-right" title="ACER" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../index.html" class="btn btn-neutral" title="Agents" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../index.html" class="btn btn-neutral float-left" title="Agents" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -296,7 +295,7 @@ If set to True, the state value targets for the V head will be estimated using t
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -313,27 +312,16 @@ If set to True, the state value targets for the V head will be estimated using t
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ACER &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>ACER &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Actor-Critic" href="ac.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -236,11 +239,11 @@ distribution assigned with these probabilities. When testing, the action with th
and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-policy updates from batches of <span class="math notranslate nohighlight">\(T_{max}\)</span> transitions sampled from the replay buffer.</p>
<p>Each update performs the following procedure:</p>
<ol class="arabic">
<li><p class="first"><strong>Calculate state values:</strong></p>
<li><p><strong>Calculate state values:</strong></p>
<div class="math notranslate nohighlight">
\[V(s_t) = \mathbb{E}_{a \sim \pi} [Q(s_t,a)]\]</div>
</li>
<li><p class="first"><strong>Calculate Q retrace:</strong></p>
<li><p><strong>Calculate Q retrace:</strong></p>
<blockquote>
<div><div class="math notranslate nohighlight">
\[Q^{ret}(s_t,a_t) = r_t +\gamma \bar{\rho}_{t+1}[Q^{ret}(s_{t+1},a_{t+1}) - Q(s_{t+1},a_{t+1})] + \gamma V(s_{t+1})\]</div>
@@ -248,7 +251,7 @@ and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-p
\[\text{where} \quad \bar{\rho}_{t} = \min{\left\{c,\rho_t\right\}},\quad \rho_t=\frac{\pi (a_t \mid s_t)}{\mu (a_t \mid s_t)}\]</div>
</div></blockquote>
</li>
<li><p class="first"><strong>Accumulate gradients:</strong></p>
<li><p><strong>Accumulate gradients:</strong></p>
<blockquote>
<div><p><span class="math notranslate nohighlight">\(\bullet\)</span> <strong>Policy gradients (with bias correction):</strong></p>
<blockquote>
@@ -263,7 +266,7 @@ and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-p
</div></blockquote>
</div></blockquote>
</li>
<li><p class="first"><strong>(Optional) Trust region update:</strong> change the policy loss gradient w.r.t network output:</p>
<li><p><strong>(Optional) Trust region update:</strong> change the policy loss gradient w.r.t network output:</p>
<blockquote>
<div><div class="math notranslate nohighlight">
\[\hat{g}_t^{trust-region} = \hat{g}_t^{policy} - \max \left\{0, \frac{k^T \hat{g}_t^{policy} - \delta}{\lVert k \rVert_2^2}\right\} k\]</div>
@@ -277,39 +280,35 @@ The goal of the trust region update is to the difference between the updated pol
<dl class="class">
<dt id="rl_coach.agents.acer_agent.ACERAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.acer_agent.</code><code class="descname">ACERAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/acer_agent.html#ACERAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.acer_agent.ACERAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_steps_between_gradient_updates</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_steps_between_gradient_updates</strong> (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</li>
<li><strong>ratio_of_replay</strong> (int)
The number of off-policy training iterations in each ACER iteration.</li>
<li><strong>num_transitions_to_start_replay</strong> (int)
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</p></li>
<li><p><strong>ratio_of_replay</strong> (int)
The number of off-policy training iterations in each ACER iteration.</p></li>
<li><p><strong>num_transitions_to_start_replay</strong> (int)
Number of environment steps until ACER starts to train off-policy from the experience replay.
This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions in
the experience replay to start the off-policy training.</li>
<li><strong>rate_for_copying_weights_to_target</strong> (float)
the experience replay to start the off-policy training.</p></li>
<li><p><strong>rate_for_copying_weights_to_target</strong> (float)
The rate of the exponential moving average for the average policy which is used for the trust region optimization.
The target network in this algorithm is used as the average policy.</li>
<li><strong>importance_weight_truncation</strong> (float)
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).</li>
<li><strong>use_trust_region_optimization</strong> (bool)
The target network in this algorithm is used as the average policy.</p></li>
<li><p><strong>importance_weight_truncation</strong> (float)
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).</p></li>
<li><p><strong>use_trust_region_optimization</strong> (bool)
If set to True, the gradients of the network will be modified with a term dependent on the KL divergence between
the average policy and the current one, to bound the change of the policy during the network update.</li>
<li><strong>max_KL_divergence</strong> (float)
the average policy and the current one, to bound the change of the policy during the network update.</p></li>
<li><p><strong>max_KL_divergence</strong> (float)
The upper bound parameter for the trust region optimization, use_trust_region_optimization needs to be set true
for this parameter to have an effect.</li>
<li><strong>beta_entropy</strong> (float)
for this parameter to have an effect.</p></li>
<li><p><strong>beta_entropy</strong> (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the beta value defined by beta_entropy.</li>
is weighted using the beta value defined by beta_entropy.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
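<p>The Q-retrace recursion from step 2 above can be computed backwards over a sampled trajectory. The sketch below uses illustrative values and the common convention of bootstrapping the recursion with V(s_T) of the state that follows the trajectory:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

gamma, c = 0.99, 1.0
rewards  = np.array([0.0, 1.0, 0.5])   # r_t
q_values = np.array([1.2, 1.0, 0.8])   # Q(s_t, a_t) from the critic
v_values = np.array([1.1, 0.9, 0.7])   # V(s_t) = E_{a~pi}[Q(s_t, a)]
rho      = np.array([1.3, 0.7, 1.1])   # importance weights pi(a_t|s_t) / mu(a_t|s_t)
rho_bar  = np.minimum(c, rho)          # truncated importance weights
v_bootstrap = 0.6                       # V(s_T) of the state after the trajectory

T = len(rewards)
q_retrace = np.zeros(T)
carried = v_bootstrap                   # rho_bar_{t+1} * (Q^ret_{t+1} - Q_{t+1}) + V(s_{t+1})
for t in reversed(range(T)):
    q_retrace[t] = rewards[t] + gamma * carried
    carried = rho_bar[t] * (q_retrace[t] - q_values[t]) + v_values[t]
</pre></div></div>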
</div>
@@ -327,7 +326,7 @@ is weighted using the beta value defined by beta_entropy.</li>
<a href="../imitation/bc.html" class="btn btn-neutral float-right" title="Behavioral Cloning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="ac.html" class="btn btn-neutral" title="Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="ac.html" class="btn btn-neutral float-left" title="Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -336,7 +335,7 @@ is weighted using the beta value defined by beta_entropy.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -353,27 +352,16 @@ is weighted using the beta value defined by beta_entropy.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Clipped Proximal Policy Optimization &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Clipped Proximal Policy Optimization &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Conditional Imitation Learning" href="../imitation/cil.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -233,17 +236,14 @@
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<p>Very similar to PPO, with several small (but very simplifying) changes:</p>
<ol class="arabic">
<li><p class="first">Train both the value and policy networks, simultaneously, by defining a single loss function,
which is the sum of each of the networks' loss functions. Then, backpropagate gradients only once from this unified loss function.</p>
</li>
<li><p class="first">The unified network's optimizer is set to Adam (instead of L-BFGS for the value network as in PPO).</p>
</li>
<li><p class="first">Value targets are now also calculated based on the GAE advantages.
<li><p>Train both the value and policy networks, simultaneously, by defining a single loss function,
which is the sum of each of the networks' loss functions. Then, backpropagate gradients only once from this unified loss function.</p></li>
<li><p>The unified network's optimizer is set to Adam (instead of L-BFGS for the value network as in PPO).</p></li>
<li><p>Value targets are now also calculated based on the GAE advantages.
In this method, the <span class="math notranslate nohighlight">\(V\)</span> values are predicted from the critic network, and then added to the GAE based advantages,
in order to get a <span class="math notranslate nohighlight">\(Q\)</span> value for each action. Now, since our critic network is predicting a <span class="math notranslate nohighlight">\(V\)</span> value for
each state, setting the <span class="math notranslate nohighlight">\(Q\)</span> calculated action-values as a target, will on average serve as a <span class="math notranslate nohighlight">\(V\)</span> state-value target.</p>
</li>
<li><p class="first">Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio
each state, setting the <span class="math notranslate nohighlight">\(Q\)</span> calculated action-values as a target, will on average serve as a <span class="math notranslate nohighlight">\(V\)</span> state-value target.</p></li>
<li><p>Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio
<span class="math notranslate nohighlight">\(r_t(\theta) =\frac{\pi_{\theta}(a|s)}{\pi_{\theta_{old}}(a|s)}\)</span> is clipped, to achieve a similar effect.
This is done by defining the policy's loss function to be the minimum between the standard surrogate loss and an epsilon
clipped surrogate loss:</p>
@@ -253,46 +253,42 @@ clipped surrogate loss:</p>
<dl class="class">
<dt id="rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.clipped_ppo_agent.</code><code class="descname">ClippedPPOAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/clipped_ppo_agent.html#ClippedPPOAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.</li>
<li><strong>gae_lambda</strong> (float)
advantage of the action, or the generalized advantage estimation (GAE) value.</p></li>
<li><p><strong>gae_lambda</strong> (float)
The <span class="math notranslate nohighlight">\(\lambda\)</span> value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.</li>
<li><strong>clip_likelihood_ratio_using_epsilon</strong> (float)
n-step estimations.</p></li>
<li><p><strong>clip_likelihood_ratio_using_epsilon</strong> (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.</li>
<li><strong>value_targets_mix_fraction</strong> (float)
implementations.</p></li>
<li><p><strong>value_targets_mix_fraction</strong> (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</li>
<li><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value will be estimated using the GAE technique.</li>
<li><strong>use_kl_regularization</strong> (bool)
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</p></li>
<li><p><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value will be estimated using the GAE technique.</p></li>
<li><p><strong>use_kl_regularization</strong> (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.</li>
<li><strong>beta_entropy</strong> (float)
policy, to bound the change of the policy during the network update.</p></li>
<li><p><strong>beta_entropy</strong> (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the <span class="math notranslate nohighlight">\(eta\)</span> value defined by beta_entropy.</li>
<li><strong>optimization_epochs</strong> (int)
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</p></li>
<li><p><strong>optimization_epochs</strong> (int)
For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
optimization_epochs value.</li>
<li><strong>optimization_epochs</strong> (Schedule)
Can be used to define a schedule over the clipping of the likelihood ratio.</li>
optimization_epochs value.</p></li>
<li><p><strong>clipping_decay_schedule</strong> (Schedule)
Can be used to define a schedule over the clipping of the likelihood ratio.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
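<p>To make the role of <code class="code docutils literal notranslate"><span class="pre">clip_likelihood_ratio_using_epsilon</span></code> concrete, here is a minimal NumPy sketch of the clipped surrogate loss described earlier on this page. The function name, array shapes and default value are assumptions for illustration; this is not the Coach implementation:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def clipped_surrogate_loss(likelihood_ratios, advantages, epsilon=0.2):
    """Mean of min(r * A, clip(r, 1 - eps, 1 + eps) * A), negated so it can be minimized.

    likelihood_ratios: pi_new(a|s) / pi_old(a|s) for a batch of transitions, shape (batch,)
    advantages:        advantage estimates for the same transitions, shape (batch,)
    epsilon:           plays the role of clip_likelihood_ratio_using_epsilon
    """
    unclipped = likelihood_ratios * advantages
    clipped = np.clip(likelihood_ratios, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -np.mean(np.minimum(unclipped, clipped))
</pre></div></div>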
</div>
@@ -310,7 +306,7 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
<a href="ddpg.html" class="btn btn-neutral float-right" title="Deep Deterministic Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../imitation/cil.html" class="btn btn-neutral" title="Conditional Imitation Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../imitation/cil.html" class="btn btn-neutral float-left" title="Conditional Imitation Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -319,7 +315,7 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -336,27 +332,16 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Deep Deterministic Policy Gradient &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Deep Deterministic Policy Gradient &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Clipped Proximal Policy Optimization" href="cppo.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -235,14 +238,14 @@ to add exploration noise to the action. When testing, use the mean vector <span
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<p>Start by sampling a batch of transitions from the experience replay.</p>
<ul>
<li><p class="first">To train the <strong>critic network</strong>, use the following targets:</p>
<li><p>To train the <strong>critic network</strong>, use the following targets:</p>
<p><span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} ))\)</span></p>
<p>First run the actor target network, using the next states as the inputs, and get <span class="math notranslate nohighlight">\(\mu (s_{t+1} )\)</span>.
Next, run the critic target network using the next states and <span class="math notranslate nohighlight">\(\mu (s_{t+1} )\)</span>, and use the output to
calculate <span class="math notranslate nohighlight">\(y_t\)</span> according to the equation above. To train the network, use the current states and actions
as the inputs, and <span class="math notranslate nohighlight">\(y_t\)</span> as the targets.</p>
</li>
<li><p class="first">To train the <strong>actor network</strong>, use the following equation:</p>
<li><p>To train the <strong>actor network</strong>, use the following equation:</p>
<p><span class="math notranslate nohighlight">\(\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]\)</span></p>
<p>Use the actor's online network to get the action mean values using the current states as the inputs.
Then, use the critic online network in order to get the gradients of the critic output with respect to the
@@ -255,35 +258,31 @@ given <span class="math notranslate nohighlight">\(\nabla_a Q(s,a)\)</span>. Fin
<dl class="class">
<dt id="rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.ddpg_agent.</code><code class="descname">DDPGAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/ddpg_agent.html#DDPGAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</li>
<li><strong>rate_for_copying_weights_to_target</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</p></li>
<li><p><strong>rate_for_copying_weights_to_target</strong> (float)
When copying the online network weights to the target network weights, a soft update will be used, which
weight the new online network weights by rate_for_copying_weights_to_target</li>
<li><strong>num_consecutive_playing_steps</strong> (StepMethod)
The number of consecutive steps to act between every two training iterations</li>
<li><strong>use_target_network_for_evaluation</strong> (bool)
weights the new online network weights by rate_for_copying_weights_to_target.</p></li>
<li><p><strong>num_consecutive_playing_steps</strong> (StepMethod)
The number of consecutive steps to act between every two training iterations</p></li>
<li><p><strong>use_target_network_for_evaluation</strong> (bool)
If set to True, the target network will be used for predicting the actions when choosing actions to act.
Since the target network weights change more slowly, the predicted actions will be more consistent.</li>
<li><strong>action_penalty</strong> (float)
Since the target network weights change more slowly, the predicted actions will be more consistent.</p></li>
<li><p><strong>action_penalty</strong> (float)
The amount by which to penalize the network on high action feature (pre-activation) values.
This can prevent the action features from saturating the TanH activation function, and therefore prevent the
gradients from becoming very low.</li>
<li><strong>clip_critic_targets</strong> (Tuple[float, float] or None)
The range to clip the critic target to in order to prevent overestimation of the action values.</li>
<li><strong>use_non_zero_discount_for_terminal_states</strong> (bool)
gradients from becoming very low.</p></li>
<li><p><strong>clip_critic_targets</strong> (Tuple[float, float] or None)
The range to clip the critic target to in order to prevent overestimation of the action values.</p></li>
<li><p><strong>use_non_zero_discount_for_terminal_states</strong> (bool)
If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
values. If set to False, the terminal states reward will be taken as the target return for the network.</li>
values. If set to False, the terminal state's reward will be taken as the target return for the network.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
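<p>The critic target and the soft target-network update described earlier in this section reduce to a few lines of NumPy. The <code class="code docutils literal notranslate"><span class="pre">target_actor</span></code> and <code class="code docutils literal notranslate"><span class="pre">target_critic</span></code> callables, array shapes and default values are assumptions for illustration; this is not the Coach implementation:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def ddpg_critic_targets(rewards, next_states, target_actor, target_critic, discount=0.99):
    """y_t = r(s_t, a_t) + gamma * Q(s_{t+1}, mu(s_{t+1})), using the target networks."""
    next_actions = target_actor(next_states)              # mu(s_{t+1}) from the actor target network
    next_q_values = target_critic(next_states, next_actions)
    return rewards + discount * next_q_values

def soft_update(online_weights, target_weights, rate_for_copying_weights_to_target=0.001):
    """Soft target update: target = tau * online + (1 - tau) * target, per weight tensor."""
    tau = rate_for_copying_weights_to_target
    return [tau * w + (1.0 - tau) * t for w, t in zip(online_weights, target_weights)]
</pre></div></div>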
</div>
@@ -301,7 +300,7 @@ values. If set to False, the terminal states reward will be taken as the target
<a href="sac.html" class="btn btn-neutral float-right" title="Soft Actor-Critic" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="cppo.html" class="btn btn-neutral" title="Clipped Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="cppo.html" class="btn btn-neutral float-left" title="Clipped Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -310,7 +309,7 @@ values. If set to False, the terminal states reward will be taken as the target
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -327,27 +326,16 @@ values. If set to False, the terminal states reward will be taken as the target
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Hierarchical Actor Critic &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Hierarchical Actor Critic &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -31,21 +39,16 @@
<link rel="search" title="Search" href="../../../search.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -212,7 +215,7 @@ to add exploration noise to the action. When testing, use the mean vector <span
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -229,27 +232,16 @@ to add exploration noise to the action. When testing, use the mean vector <span
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Policy Gradient &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Policy Gradient &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Persistent Advantage Learning" href="../value_optimization/pal.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -237,11 +240,11 @@ The <code class="code docutils literal notranslate"><span class="pre">PolicyGrad
This is done in order to reduce the variance of the updates, since noisy gradient updates might destabilize the policy's
convergence. The rescaler is a configurable parameter, and there are a few options to choose from (a short sketch of the future-return rescalers follows the list):</p>
<ul class="simple">
<li><strong>Total Episode Return</strong> - The sum of all the discounted rewards during the episode.</li>
<li><strong>Future Return</strong> - Return from each transition until the end of the episode.</li>
<li><strong>Future Return Normalized by Episode</strong> - Future returns across the episode normalized by the episode's mean and standard deviation.</li>
<li><strong>Future Return Normalized by Timestep</strong> - Future returns normalized using running means and standard deviations,
which are calculated seperately for each timestep, across different episodes.</li>
<li><p><strong>Total Episode Return</strong> - The sum of all the discounted rewards during the episode.</p></li>
<li><p><strong>Future Return</strong> - Return from each transition until the end of the episode.</p></li>
<li><p><strong>Future Return Normalized by Episode</strong> - Future returns across the episode normalized by the episode's mean and standard deviation.</p></li>
<li><p><strong>Future Return Normalized by Timestep</strong> - Future returns normalized using running means and standard deviations,
which are calculated separately for each timestep, across different episodes.</p></li>
</ul>
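<p>As a rough NumPy sketch of the first rescalers in the list above (the helper names and the discount value are assumptions for illustration, not the Coach implementation):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def future_returns(rewards, discount=0.99):
    """'Future Return': discounted return from each transition until the end of the episode."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns      # the 'Total Episode Return' rescaler would use returns[0] for every step

def normalize_by_episode(returns):
    """'Future Return Normalized by Episode': normalize by the episode's mean and standard deviation."""
    return (returns - returns.mean()) / (returns.std() + 1e-8)
</pre></div></div>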
<p>Gradients are accumulated over a number of full played episodes. The gradients accumulation over several episodes
serves the same purpose - reducing the update variance. After accumulating gradients for several episodes,
@@ -249,32 +252,28 @@ the gradients are then applied to the network.</p>
<dl class="class">
<dt id="rl_coach.agents.policy_gradients_agent.PolicyGradientAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.policy_gradients_agent.</code><code class="descname">PolicyGradientAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/policy_gradients_agent.html#PolicyGradientAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.policy_gradients_agent.PolicyGradientAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
The rescaler type to use for the policy gradient loss. For policy gradients, we calculate the log probability of
the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
return, but there are other rescalers that are intended for reducing the variance of the updates.</li>
<li><strong>apply_gradients_every_x_episodes</strong> (int)
return, but there are other rescalers that are intended for reducing the variance of the updates.</p></li>
<li><p><strong>apply_gradients_every_x_episodes</strong> (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate them in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.</li>
<li><strong>beta_entropy</strong> (float)
apply_gradients_every_x_episodes episodes.</p></li>
<li><p><strong>beta_entropy</strong> (float)
A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
will be added to the loss and scaled by the given beta factor.</li>
<li><strong>num_steps_between_gradient_updates</strong> (int)
will be added to the loss and scaled by the given beta factor.</p></li>
<li><p><strong>num_steps_between_gradient_updates</strong> (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.</li>
are used in the batch.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
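<p>As a usage sketch, these algorithm parameters are plain attributes that can be overridden after construction; inside a Coach preset they would typically be set through the agent parameters' algorithm attribute. The values below are arbitrary examples, not recommended settings:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
from rl_coach.agents.policy_gradients_agent import PolicyGradientAlgorithmParameters

algorithm = PolicyGradientAlgorithmParameters()
algorithm.apply_gradients_every_x_episodes = 5       # accumulate gradients over 5 episodes before applying
algorithm.beta_entropy = 0.01                        # weight of the entropy regularization term
algorithm.num_steps_between_gradient_updates = 20    # t_max from the A3C paper
</pre></div></div>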
</div>
@@ -292,7 +291,7 @@ are used in the batch.</li>
<a href="ppo.html" class="btn btn-neutral float-right" title="Proximal Policy Optimization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../value_optimization/pal.html" class="btn btn-neutral" title="Persistent Advantage Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../value_optimization/pal.html" class="btn btn-neutral float-left" title="Persistent Advantage Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -301,7 +300,7 @@ are used in the batch.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -318,27 +317,16 @@ are used in the batch.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Proximal Policy Optimization &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Proximal Policy Optimization &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Policy Gradient" href="pg.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -234,66 +237,62 @@ When testing, just take the mean values predicted by the network.</p>
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Collect a big chunk of experience (in the order of thousands of transitions, sampled from multiple episodes).</li>
<li>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman 2015).</li>
<li>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers,
<li><p>Collect a big chunk of experience (on the order of thousands of transitions, sampled from multiple episodes).</p></li>
<li><p>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman 2015).</p></li>
<li><p>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers,
the L-BFGS optimizer runs on the entire dataset at once, without batching.
It continues running until some low loss threshold is reached. To prevent overfitting to the current dataset,
the value targets are updated in a soft manner, using an Exponentially Weighted Moving Average, based on the total
discounted returns of each state in each episode.</li>
<li>Run several training iterations of the policy network. This is done by using the previously calculated advantages as
discounted returns of each state in each episode.</p></li>
<li><p>Run several training iterations of the policy network. This is done by using the previously calculated advantages as
targets. The loss function penalizes policies that deviate too far from the old policy (the policy that was used <em>before</em>
starting to run the current set of training iterations) using a regularization term.</li>
<li>After training is done, the last sampled KL divergence value will be compared with the <em>target KL divergence</em> value,
starting to run the current set of training iterations) using a regularization term.</p></li>
<li><p>After training is done, the last sampled KL divergence value will be compared with the <em>target KL divergence</em> value,
in order to adapt the penalty coefficient used in the policy loss. If the KL divergence went too high,
increase the penalty, if it went too low, reduce it. Otherwise, leave it unchanged.</li>
increase the penalty; if it went too low, reduce it. Otherwise, leave it unchanged (see the sketch after this list).</p></li>
</ol>
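<p>The penalty adaptation in step 5 can be sketched as follows. The tolerance band and the factors of 2 and 0.5 are illustrative assumptions, not the values used by Coach:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
def adapt_kl_penalty_coefficient(kl_divergence, kl_coefficient, target_kl_divergence, tolerance=1.5):
    """Grow or shrink the KL penalty coefficient to steer the KL divergence toward its target."""
    if kl_divergence > target_kl_divergence * tolerance:
        kl_coefficient *= 2.0   # the policy changed too much: strengthen the penalty
    elif target_kl_divergence > kl_divergence * tolerance:
        kl_coefficient *= 0.5   # the policy barely changed: relax the penalty
    # otherwise the coefficient is left unchanged
    return kl_coefficient
</pre></div></div>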
<dl class="class">
<dt id="rl_coach.agents.ppo_agent.PPOAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.ppo_agent.</code><code class="descname">PPOAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/ppo_agent.html#PPOAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.ppo_agent.PPOAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.</li>
<li><strong>gae_lambda</strong> (float)
advantage of the action, or the generalized advantage estimation (GAE) value.</p></li>
<li><p><strong>gae_lambda</strong> (float)
The <span class="math notranslate nohighlight">\(\lambda\)</span> value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.</li>
<li><strong>target_kl_divergence</strong> (float)
n-step estimations.</p></li>
<li><p><strong>target_kl_divergence</strong> (float)
The target KL divergence between the current policy distribution and the new policy. PPO uses a heuristic to
bring the KL divergence to this value, by adding a penalty if the kl divergence is higher.</li>
<li><strong>initial_kl_coefficient</strong> (float)
bring the KL divergence to this value, by adding a penalty if the KL divergence is higher.</p></li>
<li><p><strong>initial_kl_coefficient</strong> (float)
The initial weight that will be given to the KL divergence between the current and the new policy in the
regularization factor.</li>
<li><strong>high_kl_penalty_coefficient</strong> (float)
The penalty that will be given for KL divergence values which are highes than what was defined as the target.</li>
<li><strong>clip_likelihood_ratio_using_epsilon</strong> (float)
regularization factor.</p></li>
<li><p><strong>high_kl_penalty_coefficient</strong> (float)
The penalty that will be given for KL divergence values which are higher than what was defined as the target.</p></li>
<li><p><strong>clip_likelihood_ratio_using_epsilon</strong> (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.</li>
<li><strong>value_targets_mix_fraction</strong> (float)
implementations.</p></li>
<li><p><strong>value_targets_mix_fraction</strong> (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</li>
<li><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value will be estimated using the GAE technique.</li>
<li><strong>use_kl_regularization</strong> (bool)
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</p></li>
<li><p><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value will be estimated using the GAE technique.</p></li>
<li><p><strong>use_kl_regularization</strong> (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.</li>
<li><strong>beta_entropy</strong> (float)
policy, to bound the change of the policy during the network update.</p></li>
<li><p><strong>beta_entropy</strong> (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the <span class="math notranslate nohighlight">\(eta\)</span> value defined by beta_entropy.</li>
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
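<p>Since several of these parameters (<code class="code docutils literal notranslate"><span class="pre">gae_lambda</span></code>, <code class="code docutils literal notranslate"><span class="pre">estimate_state_value_using_gae</span></code>) revolve around Generalized Advantage Estimation, here is a minimal NumPy sketch of GAE. The signature and the simplified terminal-state handling are assumptions for illustration, not the Coach implementation:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def generalized_advantage_estimation(rewards, values, bootstrap_value, discount=0.99, gae_lambda=0.95):
    """GAE(lambda): exponentially weighted sum of the one-step TD errors along the trajectory."""
    values = np.append(values, bootstrap_value)                # V(s_0) ... V(s_T), plus the bootstrap value
    deltas = rewards + discount * values[1:] - values[:-1]     # one-step TD errors delta_t
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + discount * gae_lambda * running  # A_t = delta_t + gamma * lambda * A_{t+1}
        advantages[t] = running
    return advantages
</pre></div></div>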
</div>
@@ -311,7 +310,7 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
<a href="../value_optimization/rainbow.html" class="btn btn-neutral float-right" title="Rainbow" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="pg.html" class="btn btn-neutral" title="Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="pg.html" class="btn btn-neutral float-left" title="Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -320,7 +319,7 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -337,27 +336,16 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Soft Actor-Critic &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Soft Actor-Critic &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Deep Deterministic Policy Gradient" href="ddpg.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -235,19 +238,19 @@ by picking the mean value or sample from a gaussian distribution like in trainin
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<p>Start by sampling a batch <span class="math notranslate nohighlight">\(B\)</span> of transitions from the experience replay.</p>
<ul>
<li><p class="first">To train the <strong>Q network</strong>, use the following targets:</p>
<li><p>To train the <strong>Q network</strong>, use the following targets:</p>
<div class="math notranslate nohighlight">
\[y_t^Q=r(s_t,a_t)+\gamma \cdot V(s_{t+1})\]</div>
<p>The state value used in the above target is acquired by running the target state value network.</p>
</li>
<li><p class="first">To train the <strong>State Value network</strong>, use the following targets:</p>
<li><p>To train the <strong>State Value network</strong>, use the following targets:</p>
<div class="math notranslate nohighlight">
\[y_t^V = \min_{i=1,2}Q_i(s_t,\tilde{a}) - log\pi (\tilde{a} \vert s),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)\]</div>
<p>The state value network is trained using a sample-based approximation of the connection between the state value and the state-action
values. The actions used for constructing the target are <strong>not</strong> sampled from the replay buffer, but rather sampled
from the current policy.</p>
</li>
<li><p class="first">To train the <strong>actor network</strong>, use the following equation:</p>
<li><p>To train the <strong>actor network</strong>, use the following equation:</p>
<div class="math notranslate nohighlight">
\[\nabla_{\theta} J \approx \nabla_{\theta} \frac{1}{\vert B \vert} \sum_{s_t\in B} \left( Q \left(s_t, \tilde{a}_\theta(s_t)\right) - log\pi_{\theta}(\tilde{a}_{\theta}(s_t)\vert s_t) \right),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)\]</div>
</li>
@@ -256,24 +259,20 @@ from the current policy.</p>
<dl class="class">
<dt id="rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.soft_actor_critic_agent.</code><code class="descname">SoftActorCriticAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/soft_actor_critic_agent.html#SoftActorCriticAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</li>
<li><strong>rate_for_copying_weights_to_target</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</p></li>
<li><p><strong>rate_for_copying_weights_to_target</strong> (float)
When copying the online network weights to the target network weights, a soft update will be used, which
weight the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)</li>
<li><strong>use_deterministic_for_evaluation</strong> (bool)
weights the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)</p></li>
<li><p><strong>use_deterministic_for_evaluation</strong> (bool)
If True, during the evaluation phase, actions are chosen deterministically according to the policy mean
and not sampled from the policy distribution.</li>
and not sampled from the policy distribution.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
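<p>The two targets described earlier on this page reduce to a few lines of NumPy. The array names and shapes are assumptions for illustration, and the entropy temperature and terminal-state handling are omitted; this is not the Coach implementation:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def sac_q_targets(rewards, next_state_values, discount=0.99):
    """y_t^Q = r(s_t, a_t) + gamma * V_target(s_{t+1}), using the target state value network."""
    return rewards + discount * next_state_values

def sac_state_value_targets(q1_values, q2_values, log_pi):
    """y_t^V = min(Q_1(s_t, a~), Q_2(s_t, a~)) - log pi(a~ | s_t), with a~ sampled from the current policy."""
    return np.minimum(q1_values, q2_values) - log_pi
</pre></div></div>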
</div>
@@ -291,7 +290,7 @@ and not sampled from the policy distribution.</li>
<a href="../other/dfp.html" class="btn btn-neutral float-right" title="Direct Future Prediction" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="ddpg.html" class="btn btn-neutral" title="Deep Deterministic Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="ddpg.html" class="btn btn-neutral float-left" title="Deep Deterministic Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -300,7 +299,7 @@ and not sampled from the policy distribution.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -317,27 +316,16 @@ and not sampled from the policy distribution.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Bootstrapped DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Bootstrapped DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Behavioral Cloning" href="../imitation/bc.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -265,7 +268,7 @@ Then, train the online network according to the calculated targets.</p>
<a href="categorical_dqn.html" class="btn btn-neutral float-right" title="Categorical DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../imitation/bc.html" class="btn btn-neutral" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../imitation/bc.html" class="btn btn-neutral float-left" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -274,7 +277,7 @@ Then, train the online network according to the calculated targets.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -291,27 +294,16 @@ Then, train the online network according to the calculated targets.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Categorical DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Categorical DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Bootstrapped DQN" href="bs_dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,43 +230,36 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic">
<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
</li>
<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
<p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
<p>where <span class="math notranslate nohighlight">\([ \cdot ]\)</span> bounds its argument in the range <span class="math notranslate nohighlight">\([a, b]\)</span>, and <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom <span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r+\gamma z_j\)</span> (see the sketch after this list).</p>
</li>
<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
probability distribution. Only the target of the actions that were actually taken is updated.</p>
</li>
<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
</li>
<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
probability distribution. Only the target of the actions that were actually taken is updated.</p></li>
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
</ol>
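<p>The projection in step 2 can be sketched with NumPy as follows. Terminal-state handling is omitted and the default support values are assumptions for illustration, not the Coach implementation:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def project_distribution(rewards, next_probabilities, v_min=-10.0, v_max=10.0, atoms=51, discount=0.99):
    """Project the Bellman update r + gamma * z_j onto the fixed support {z_i} for a batch of transitions."""
    z = np.linspace(v_min, v_max, atoms)                  # the atom support z_0 ... z_{N-1}
    delta_z = (v_max - v_min) / (atoms - 1)
    projected = np.zeros_like(next_probabilities)         # next_probabilities: p_j(s_{t+1}, pi(s_{t+1})), shape (batch, atoms)
    for j in range(atoms):
        tz_j = np.clip(rewards + discount * z[j], v_min, v_max)   # bounded Bellman update for atom j
        weight = np.clip(1.0 - np.abs(tz_j[:, None] - z[None, :]) / delta_z, 0.0, 1.0)
        projected += weight * next_probabilities[:, j][:, None]
    return projected
</pre></div></div>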
<dl class="class">
<dt id="rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.categorical_dqn_agent.</code><code class="descname">CategoricalDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/categorical_dqn_agent.html#CategoricalDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>v_min</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>v_min</strong> (float)
The minimal value that will be represented in the network output for predicting the Q value.
Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</li>
<li><strong>v_max</strong> (float)
Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</p></li>
<li><p><strong>v_max</strong> (float)
The maximum value that will be represented in the network output for predicting the Q value.
Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</li>
<li><strong>atoms</strong> (int)
Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</p></li>
<li><p><strong>atoms</strong> (int)
The number of atoms that will be used to discretize the range between v_min and v_max.
For the C51 algorithm described in the paper, the number of atoms is 51.</li>
For the C51 algorithm described in the paper, the number of atoms is 51.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -281,7 +277,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
<a href="../imitation/cil.html" class="btn btn-neutral float-right" title="Conditional Imitation Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="bs_dqn.html" class="btn btn-neutral" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="bs_dqn.html" class="btn btn-neutral float-left" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -290,7 +286,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -307,27 +303,16 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Double DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Double DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Direct Future Prediction" href="../other/dfp.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,17 +230,17 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
action <span class="math notranslate nohighlight">\(argmax_a Q(s_{t+1},a)\)</span>. For these actions, use the corresponding next states and run the target
network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</li>
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</p></li>
<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
use the current states from the sampled batch, and run the online network to get the current Q value predictions.
Set those values as the targets for the actions that were not actually played.</li>
<li>For each action that was played, use the following equation for calculating the targets of the network:
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
Set those values as the targets for the actions that were not actually played.</p></li>
<li><p>For each action that was played, use the following equation for calculating the targets of the network:
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
</ol>
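<p>The following is a minimal NumPy sketch of the target construction in steps 2-5 above. It is an illustration only, not Coach's implementation: the online_q and target_q callables, the batch dictionary layout and the done flag are hypothetical stand-ins.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def double_dqn_targets(online_q, target_q, batch, gamma=0.99):
    """Build per-action regression targets for a Double DQN update.

    online_q and target_q are assumed to map a batch of states to an
    array of shape (batch_size, num_actions); batch holds numpy arrays
    under the keys states, actions, rewards, next_states and done.
    """
    # Step 2: pick the argmax actions with the online network ...
    next_online_q = online_q(batch["next_states"])            # (B, A)
    best_actions = np.argmax(next_online_q, axis=1)           # (B,)
    # ... and evaluate those actions with the target network.
    next_target_q = target_q(batch["next_states"])            # (B, A)
    idx = np.arange(len(best_actions))
    bootstrapped = next_target_q[idx, best_actions]

    # Step 3: start from the online predictions so that actions that
    # were not played contribute zero error to the MSE loss.
    targets = online_q(batch["states"]).copy()                # (B, A)

    # Step 4: y_t = r_t + gamma * Q_target(s_{t+1}, argmax_a Q_online(s_{t+1}, a))
    targets[idx, batch["actions"]] = (
        batch["rewards"] + gamma * (1.0 - batch["done"]) * bootstrapped
    )
    return targets  # Step 5: regress the online network onto these targets
</pre></div></div>
<p>Decoupling action selection (online network) from action evaluation (target network) is what reduces the over-estimation bias relative to vanilla DQN.</p>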
</div>
</div>
@@ -254,7 +257,7 @@ Set those values as the targets for the actions that were not actually played.</
<a href="dqn.html" class="btn btn-neutral float-right" title="Deep Q Networks" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../other/dfp.html" class="btn btn-neutral" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../other/dfp.html" class="btn btn-neutral float-left" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -263,7 +266,7 @@ Set those values as the targets for the actions that were not actually played.</
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -280,27 +283,16 @@ Set those values as the targets for the actions that were not actually played.</
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Deep Q Networks &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Deep Q Networks &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Double DQN" href="double_dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,16 +230,16 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</li>
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</p></li>
<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
use the current states from the sampled batch, and run the online network to get the current Q value predictions.
Set those values as the targets for the actions that were not actually played.</li>
<li>For each action that was played, use the following equation for calculating the targets of the network:
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot max_a Q(s_{t+1})\)</span></li>
<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
Set those values as the targets for the actions that were not actually played.</p></li>
<li><p>For each action that was played, use the following equation for calculating the targets of the network:
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot max_a Q(s_{t+1})\)</span></p></li>
<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
</ol>
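<p>For comparison with the Double DQN sketch above, a hedged NumPy illustration of the vanilla DQN target in steps 2-4: here the maximum is taken directly over the target network's own predictions. The online_q, target_q and batch names are the same hypothetical stand-ins.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def dqn_targets(online_q, target_q, batch, gamma=0.99):
    """Vanilla DQN targets: max_a Q(s_{t+1}, a) comes from the target network."""
    next_q = target_q(batch["next_states"])                   # (B, A)
    bootstrapped = next_q.max(axis=1)                         # step 2

    # Step 3: copying the online predictions zeroes the error on
    # actions that were not played.
    targets = online_q(batch["states"]).copy()
    idx = np.arange(len(batch["actions"]))
    # Step 4: y_t = r_t + gamma * max_a Q(s_{t+1}, a)
    targets[idx, batch["actions"]] = (
        batch["rewards"] + gamma * (1.0 - batch["done"]) * bootstrapped
    )
    return targets
</pre></div></div>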
<dl class="class">
<dt id="rl_coach.agents.dqn_agent.DQNAlgorithmParameters">
@@ -258,7 +261,7 @@ Set those values as the targets for the actions that were not actually played.</
<a href="dueling_dqn.html" class="btn btn-neutral float-right" title="Dueling DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="double_dqn.html" class="btn btn-neutral" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="double_dqn.html" class="btn btn-neutral float-left" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -267,7 +270,7 @@ Set those values as the targets for the actions that were not actually played.</
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -284,27 +287,16 @@ Set those values as the targets for the actions that were not actually played.</
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Dueling DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Dueling DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Deep Q Networks" href="dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -245,7 +248,7 @@ single action has been taken at this state.</p>
<a href="mmc.html" class="btn btn-neutral float-right" title="Mixed Monte Carlo" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="dqn.html" class="btn btn-neutral" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="dqn.html" class="btn btn-neutral float-left" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -254,7 +257,7 @@ single action has been taken at this state.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -271,27 +274,16 @@ single action has been taken at this state.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Mixed Monte Carlo &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Mixed Monte Carlo &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Dueling DQN" href="dueling_dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -238,16 +241,13 @@ Once in every few thousand steps, copy the weights from the online network to th
<dl class="class">
<dt id="rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.mmc_agent.</code><code class="descname">MixedMonteCarloAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/mmc_agent.html#MixedMonteCarloAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>monte_carlo_mixing_rate</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>monte_carlo_mixing_rate</strong> (float)
The mixing rate is used for setting the amount of the Monte Carlo estimate (the full return) that will be mixed into
the single-step bootstrapped targets.</td>
</tr>
</tbody>
</table>
the single-step bootstrapped targets.</p>
</dd>
</dl>
</dd></dl>
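<p>As a rough illustration of how such a mixing rate is typically applied (a sketch, not Coach's code), the one-step bootstrapped target and the full-episode discounted return can be blended as follows; the argument names are illustrative.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
def mixed_monte_carlo_target(one_step_target, total_discounted_return,
                             monte_carlo_mixing_rate=0.1):
    """Blend the bootstrapped target with the full Monte Carlo return.

    A mixing rate of 0 recovers the plain one-step target, while 1 uses
    the total discounted return only.
    """
    return ((1.0 - monte_carlo_mixing_rate) * one_step_target
            + monte_carlo_mixing_rate * total_discounted_return)
</pre></div></div>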
</div>
@@ -265,7 +265,7 @@ the single-step bootstrapped targets.</td>
<a href="n_step.html" class="btn btn-neutral float-right" title="N-Step Q Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="dueling_dqn.html" class="btn btn-neutral" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="dueling_dqn.html" class="btn btn-neutral float-left" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -274,7 +274,7 @@ the single-step bootstrapped targets.</td>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -291,27 +291,16 @@ the single-step bootstrapped targets.</td>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>N-Step Q Learning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>N-Step Q Learning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Mixed Monte Carlo" href="mmc.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -228,43 +231,39 @@
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<p>The <span class="math notranslate nohighlight">\(N\)</span>-step Q learning algorithm works in similar manner to DQN except for the following changes:</p>
<ol class="arabic simple">
<li>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</li>
<li>In order to stabilize the learning, multiple workers work together to update the network.
This creates the same effect as decorrelating the samples used for training.</li>
<li>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
<li><p>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</p></li>
<li><p>In order to stabilize the learning, multiple workers work together to update the network.
This creates the same effect as decorrelating the samples used for training.</p></li>
<li><p>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
to form the <span class="math notranslate nohighlight">\(N\)</span>-step Q targets, according to the following equation:
<span class="math notranslate nohighlight">\(R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})\)</span>
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</li>
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</p></li>
</ol>
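<p>A small NumPy sketch of the accumulation in step 3, under the assumption that the last N rewards are given oldest-first and that the bootstrap value is the target network's maximal Q value at the state following the rollout (zero on episode termination); it is illustrative, not Coach's implementation.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def n_step_q_targets(rewards, bootstrap_value, gamma=0.99):
    """Turn the latest N rewards into N-step targets.

    rewards: the rewards of the latest N steps, oldest first.
    bootstrap_value: max_a Q(s_{t+N}, a) from the target network, or 0
    if the episode terminated. Each state in the rollout is bootstrapped
    over the k remaining steps, matching the equation above.
    """
    targets = np.zeros(len(rewards))
    running = bootstrap_value
    # Work backwards so each target reuses the one computed after it:
    # R_t = r_t + gamma * R_{t+1}, with R_N = bootstrap_value.
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        targets[i] = running
    return targets
</pre></div></div>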
<dl class="class">
<dt id="rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.n_step_q_agent.</code><code class="descname">NStepQAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/n_step_q_agent.html#NStepQAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</li>
<li><strong>apply_gradients_every_x_episodes</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</p></li>
<li><p><strong>apply_gradients_every_x_episodes</strong> (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
then accumulate them in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.</li>
<li><strong>num_steps_between_gradient_updates</strong> (int)
apply_gradients_every_x_episodes episodes.</p></li>
<li><p><strong>num_steps_between_gradient_updates</strong> (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between two consecutive gradient calculations
are used in the batch.</li>
<li><strong>targets_horizon</strong> (str)
are used in the batch.</p></li>
<li><p><strong>targets_horizon</strong> (str)
Should be either N-Step or 1-Step, and defines the length for which to bootstrap the network values over.
Essentially, 1-Step follows the regular 1 step bootstrapping Q learning update. For more information,
please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</li>
please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
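<p>To make the interaction between num_steps_between_gradient_updates and apply_gradients_every_x_episodes concrete, here is a hedged bookkeeping sketch; accumulated, new_gradients and apply_fn are illustrative placeholders rather than Coach API.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
def accumulate_then_apply(accumulated, new_gradients, episode_index,
                          apply_gradients_every_x_episodes, apply_fn):
    """Accumulate per-variable gradients, applying them only every few episodes.

    accumulated and new_gradients map variable names to numpy arrays;
    apply_fn stands in for whatever actually updates the network weights.
    """
    for name, grad in new_gradients.items():
        if name in accumulated:
            accumulated[name] = accumulated[name] + grad
        else:
            accumulated[name] = grad
    # Gradients are computed every num_steps_between_gradient_updates steps,
    # but only applied once in every apply_gradients_every_x_episodes episodes.
    if episode_index % apply_gradients_every_x_episodes == 0:
        apply_fn(accumulated)
        accumulated.clear()
    return accumulated
</pre></div></div>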
</div>
@@ -282,7 +281,7 @@ please refer to the original paper (<a class="reference external" href="https://
<a href="naf.html" class="btn btn-neutral float-right" title="Normalized Advantage Functions" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="mmc.html" class="btn btn-neutral" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="mmc.html" class="btn btn-neutral float-left" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -291,7 +290,7 @@ please refer to the original paper (<a class="reference external" href="https://
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -308,27 +307,16 @@ please refer to the original paper (<a class="reference external" href="https://
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Normalized Advantage Functions &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Normalized Advantage Functions &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="N-Step Q Learning" href="n_step.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -258,7 +261,7 @@ After every training step, use a soft update in order to copy the weights from t
<a href="nec.html" class="btn btn-neutral float-right" title="Neural Episodic Control" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="n_step.html" class="btn btn-neutral" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="n_step.html" class="btn btn-neutral float-left" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -267,7 +270,7 @@ After every training step, use a soft update in order to copy the weights from t
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -284,27 +287,16 @@ After every training step, use a soft update in order to copy the weights from t
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Neural Episodic Control &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Neural Episodic Control &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Normalized Advantage Functions" href="naf.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -229,14 +232,14 @@
<div class="section" id="choosing-an-action">
<h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
output from the middleware.</li>
<li>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
<li><p>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
output from the middleware.</p></li>
<li><p>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
The DND is queried and returns the <span class="math notranslate nohighlight">\(P\)</span> nearest neighbor keys and values. The keys and values are used to calculate
and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</li>
<li>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</li>
<li>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</li>
and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</p></li>
<li><p>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</p></li>
<li><p>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</p></li>
</ol>
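<p>The DND lookup in steps 2-3 can be sketched as an inverse-distance weighted average over the P nearest stored entries, as in the NEC paper; the array layouts, the exploration_policy callable and the per-action DND tuples below are illustrative assumptions, not Coach's data structures.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def dnd_q_value(query_embedding, keys, values, num_neighbors=50, delta=1e-3):
    """Kernel-weighted average over the nearest DND entries for one action.

    keys: (n, d) stored embeddings, values: (n,) stored Q estimates.
    The inverse-distance kernel with a small delta follows the paper.
    """
    dists = np.sum((keys - query_embedding) ** 2, axis=1)
    nearest = np.argsort(dists)[:num_neighbors]
    kernel = 1.0 / (dists[nearest] + delta)
    weights = kernel / kernel.sum()
    return float(np.dot(weights, values[nearest]))

def choose_action(query_embedding, dnd_per_action, exploration_policy):
    # Step 2: one Q value per action, each from that action's own DND.
    q_values = [dnd_q_value(query_embedding, keys, values)
                for keys, values in dnd_per_action]
    # Step 3: hand all Q values to the exploration policy (e.g. e-greedy).
    return exploration_policy(np.asarray(q_values))
</pre></div></div>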
</div>
<div class="section" id="finalizing-an-episode">
@@ -256,40 +259,36 @@ the network if necessary:
<dl class="class">
<dt id="rl_coach.agents.nec_agent.NECAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.nec_agent.</code><code class="descname">NECAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/nec_agent.html#NECAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.nec_agent.NECAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>dnd_size</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dnd_size</strong> (int)
Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
of transitions that will be stored is dnd_size x num_actions.</li>
<li><strong>l2_norm_added_delta</strong> (float)
of transitions that will be stored is dnd_size x num_actions.</p></li>
<li><p><strong>l2_norm_added_delta</strong> (float)
A small value that will be added when calculating the weight of each of the DND entries. This follows the
<span class="math notranslate nohighlight">\(\delta\)</span> patameter defined in the paper.</li>
<li><strong>new_value_shift_coefficient</strong> (float)
<span class="math notranslate nohighlight">\(\delta\)</span> patameter defined in the paper.</p></li>
<li><p><strong>new_value_shift_coefficient</strong> (float)
In the case where a new embedding that is added to the DND is already present, the value that will be stored
in the DND is a mix between the existing value and the new value. The mix rate is defined by
new_value_shift_coefficient.</li>
<li><strong>number_of_knn</strong> (int)
The number of neighbors that will be retrieved for each DND query.</li>
<li><strong>DND_key_error_threshold</strong> (float)
new_value_shift_coefficient.</p></li>
<li><p><strong>number_of_knn</strong> (int)
The number of neighbors that will be retrieved for each DND query.</p></li>
<li><p><strong>DND_key_error_threshold</strong> (float)
When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
exists in the DND, since exact matches of embeddings are very rare.</li>
<li><strong>propagate_updates_to_DND</strong> (bool)
exists in the DND, since exact matches of embeddings are very rare.</p></li>
<li><p><strong>propagate_updates_to_DND</strong> (bool)
If set to True, when the gradients of the network will be calculated, the gradients will also be
backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
network weights.</li>
<li><strong>n_step</strong> (int)
The bootstrap length that will be used when calculating the state values to store in the DND.</li>
<li><strong>bootstrap_total_return_from_old_policy</strong> (bool)
network weights.</p></li>
<li><p><strong>n_step</strong> (int)
The bootstrap length that will be used when calculating the state values to store in the DND.</p></li>
<li><p><strong>bootstrap_total_return_from_old_policy</strong> (bool)
If set to True, the bootstrap that will be used to calculate each state-action value is the network value
when the state was first seen, and not the latest, most up-to-date network value.</li>
when the state was first seen, and not the latest, most up-to-date network value.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
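<p>A hedged sketch of how DND_key_error_threshold and new_value_shift_coefficient interact when writing to a single action's DND; the list-based storage and the function name are illustrative only, not Coach's implementation.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def dnd_write(keys, values, new_key, new_value,
              key_error_threshold=0.01, new_value_shift_coefficient=0.1):
    """Insert or update one (embedding, value) pair in a single-action DND.

    keys is a list of 1-D numpy embeddings and values a list of floats.
    If the new embedding is close enough to an existing key, the stored
    value is shifted toward the new one instead of adding a duplicate.
    """
    if keys:
        dists = np.sum((np.stack(keys) - new_key) ** 2, axis=1)
        nearest = int(np.argmin(dists))
        if np.less(dists[nearest], key_error_threshold):
            values[nearest] += new_value_shift_coefficient * (new_value - values[nearest])
            return keys, values
    keys.append(new_key)
    values.append(float(new_value))
    return keys, values
</pre></div></div>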
</div>
@@ -307,7 +306,7 @@ when the state was first seen, and not the latest, most up-to-date network value
<a href="pal.html" class="btn btn-neutral float-right" title="Persistent Advantage Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="naf.html" class="btn btn-neutral" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="naf.html" class="btn btn-neutral float-left" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -316,7 +315,7 @@ when the state was first seen, and not the latest, most up-to-date network value
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -333,27 +332,16 @@ when the state was first seen, and not the latest, most up-to-date network value
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Persistent Advantage Learning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Persistent Advantage Learning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Neural Episodic Control" href="nec.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,47 +230,43 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Start by calculating the initial target values in the same manner as they are calculated in DDQN
<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
<li>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>Start by calculating the initial target values in the same manner as they are calculated in DDQN
<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
<li><p>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
To calculate the action gap, run the target network using the current states and get the <span class="math notranslate nohighlight">\(Q\)</span> values
for all the actions. Then estimate <span class="math notranslate nohighlight">\(V\)</span> as the maximum predicted <span class="math notranslate nohighlight">\(Q\)</span> value for the current state:
<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></li>
<li>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></p></li>
<li><p>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
the targets <span class="math notranslate nohighlight">\(y_t^{DDQN}\)</span>:
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></li>
<li>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></p></li>
<li><p>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
gap for the next state:
<span class="math notranslate nohighlight">\(V(s_{t+1} )-Q(s_{t+1},a_{t+1})\)</span>
where <span class="math notranslate nohighlight">\(a_{t+1}\)</span> is chosen by running the next states through the online network and choosing the action that
has the highest predicted <span class="math notranslate nohighlight">\(Q\)</span> value. Finally, the targets will be defined as:
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></li>
<li>Train the online network using the current states as inputs, and with the aforementioned targets.</li>
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></p></li>
<li><p>Train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
</ol>
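<p>The following NumPy sketch condenses steps 2-5 above into one target computation. It is illustrative rather than Coach's implementation; online_q, target_q and the batch layout are the same hypothetical stand-ins used in the DQN sketches earlier.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def pal_targets(online_q, target_q, batch, gamma=0.99, alpha=0.9,
                persistent=True):
    """Advantage-learning / PAL targets built on top of the DDQN target."""
    idx = np.arange(len(batch["actions"]))

    # Step 2: the usual Double DQN target.
    next_online = online_q(batch["next_states"])
    next_actions = np.argmax(next_online, axis=1)
    next_target = target_q(batch["next_states"])
    y_ddqn = (batch["rewards"]
              + gamma * (1.0 - batch["done"]) * next_target[idx, next_actions])

    # Step 3: action gap at the current state, from the target network.
    q_current = target_q(batch["states"])
    gap_t = q_current.max(axis=1) - q_current[idx, batch["actions"]]

    if persistent:
        # Step 5: action gap at the next state, evaluated at the action the
        # online network would choose there.
        gap_t1 = next_target.max(axis=1) - next_target[idx, next_actions]
        return y_ddqn - alpha * np.minimum(gap_t, gap_t1)
    # Step 4: plain advantage learning.
    return y_ddqn - alpha * gap_t
</pre></div></div>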
<dl class="class">
<dt id="rl_coach.agents.pal_agent.PALAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.pal_agent.</code><code class="descname">PALAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/pal_agent.html#PALAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.pal_agent.PALAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pal_alpha</strong> (float)
A factor that weights the amount by which the advantage learning update will be taken into account.</li>
<li><strong>persistent_advantage_learning</strong> (bool)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>pal_alpha</strong> (float)
A factor that weights the amount by which the advantage learning update will be taken into account.</p></li>
<li><p><strong>persistent_advantage_learning</strong> (bool)
If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
the same actions one after the other instead of changing actions.</li>
<li><strong>monte_carlo_mixing_rate</strong> (float)
the same actions one after the other instead of changing actions.</p></li>
<li><p><strong>monte_carlo_mixing_rate</strong> (float)
The amount of Monte Carlo values to mix into the targets of the network. The Monte Carlo values are just the
total discounted returns, and they can help reduce the time it takes for the network to update to the newly
seen values, since they are not based on bootstrapping the current network values.</li>
seen values, since they are not based on bootstrapping the current network values.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -285,7 +284,7 @@ seen values, since it is not based on bootstrapping the current network values.<
<a href="../policy_optimization/pg.html" class="btn btn-neutral float-right" title="Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="nec.html" class="btn btn-neutral" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="nec.html" class="btn btn-neutral float-left" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -294,7 +293,7 @@ seen values, since it is not based on bootstrapping the current network values.<
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -311,27 +310,16 @@ seen values, since it is not based on bootstrapping the current network values.<
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Quantile Regression DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Quantile Regression DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Rainbow" href="rainbow.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,33 +230,29 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
by following the Bellman equation.
Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the
quantile midpoints targets.</li>
<li>The network is trained with the quantile regression loss between the resulting quantile locations and the target
quantile locations. Only the targets of the actions that were actually taken are updated.</li>
<li>Once in every few thousand steps, weights are copied from the online network to the target network.</li>
quantile midpoints targets.</p></li>
<li><p>The network is trained with the quantile regression loss between the resulting quantile locations and the target
quantile locations. Only the targets of the actions that were actually taken are updated.</p></li>
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
</ol>
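<p>Step 3's quantile regression loss can be sketched for a single state-action pair as below (a NumPy illustration under the usual QR-DQN formulation, not Coach's code); pred_quantiles and target_quantiles are assumed to be the predicted and Bellman-updated quantile locations for the taken action.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def quantile_huber_loss(pred_quantiles, target_quantiles, kappa=1.0):
    """Quantile-regression Huber loss for a single state-action pair.

    pred_quantiles: (N,) quantile locations predicted for the taken action,
    assumed already sorted. target_quantiles: (M,) Bellman-updated
    locations, r plus gamma times the next-state quantiles. tau holds the
    quantile midpoints (2i+1)/(2N) used to weight the errors asymmetrically.
    """
    n = len(pred_quantiles)
    tau = (2.0 * np.arange(n) + 1.0) / (2.0 * n)
    # Pairwise TD errors between every target and every predicted quantile.
    u = target_quantiles[None, :] - pred_quantiles[:, None]        # (N, M)
    abs_u = np.abs(u)
    # Huber loss: quadratic where abs_u is at most kappa, linear beyond it.
    huber = np.where(np.less_equal(abs_u, kappa),
                     0.5 * u ** 2,
                     kappa * (abs_u - 0.5 * kappa))
    # Under-estimation is weighted by tau, over-estimation by 1 - tau.
    weight = np.abs(tau[:, None] - np.less(u, 0.0).astype(float))
    return float((weight * huber / kappa).mean(axis=1).sum())
</pre></div></div>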
<dl class="class">
<dt id="rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.qr_dqn_agent.</code><code class="descname">QuantileRegressionDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/qr_dqn_agent.html#QuantileRegressionDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>atoms</strong> (int)
the number of atoms to predict for each action</li>
<li><strong>huber_loss_interval</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>atoms</strong> (int)
the number of atoms to predict for each action</p></li>
<li><p><strong>huber_loss_interval</strong> (float)
One of the Huber loss parameters, referred to as <span class="math notranslate nohighlight">\(\kappa\)</span> in the paper.
It describes the interval [-k, k] in which the Huber loss acts as an MSE loss.</li>
It describes the interval [-k, k] in which the Huber loss acts as an MSE loss.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -271,7 +270,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
<a href="../../architectures/index.html" class="btn btn-neutral float-right" title="Architectures" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="rainbow.html" class="btn btn-neutral" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="rainbow.html" class="btn btn-neutral float-left" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -280,7 +279,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -297,27 +296,16 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Rainbow &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Rainbow &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Proximal Policy Optimization" href="../policy_optimization/ppo.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -226,19 +229,18 @@
<h2>Algorithm Description<a class="headerlink" href="#algorithm-description" title="Permalink to this headline"></a></h2>
<p>Rainbow combines 6 recent advancements in reinforcement learning:</p>
<ul class="simple">
<li>N-step returns</li>
<li>Distributional state-action value learning</li>
<li>Dueling networks</li>
<li>Noisy Networks</li>
<li>Double DQN</li>
<li>Prioritized Experience Replay</li>
<li><p>N-step returns</p></li>
<li><p>Distributional state-action value learning</p></li>
<li><p>Dueling networks</p></li>
<li><p>Noisy Networks</p></li>
<li><p>Double DQN</p></li>
<li><p>Prioritized Experience Replay</p></li>
</ul>
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic">
<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
</li>
<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
<p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
<p>where:
@@ -246,36 +248,29 @@ that the <span class="math notranslate nohighlight">\(i-th\)</span> component of
* <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom
<span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r_t+\gamma r_{t+1} + ... + \gamma r_{t+n-1} + \gamma^{n-1} z_j\)</span></p>
</li>
<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
probability distribution. Only the target of the actions that were actually taken is updated.</p>
</li>
<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
</li>
<li><p class="first">After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
using the KL divergence loss that is returned from the network.</p>
</li>
<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
probability distribution. Only the target of the actions that were actually taken is updated.</p></li>
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
<li><p>After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
using the KL divergence loss that is returned from the network.</p></li>
</ol>
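<p>To make the projection step above concrete, here is a minimal NumPy sketch (an illustrative assumption, not code from the Coach package; all names are hypothetical) that projects an n-step distributional Bellman update onto a fixed atom support:</p>
<div class="highlight"><pre>
import numpy as np

def project_bellman_update(rewards, next_probs, v_min=-10.0, v_max=10.0,
                           num_atoms=51, gamma=0.99, n_step=3):
    """Project an n-step distributional Bellman update onto the fixed atoms (sketch)."""
    batch_size = rewards.shape[0]
    delta_z = (v_max - v_min) / (num_atoms - 1)
    z = np.linspace(v_min, v_max, num_atoms)                # atom support z_j
    # n-step Bellman update per atom, clipped to [V_MIN, V_MAX];
    # `rewards` is assumed to already hold the discounted n-step reward sum
    tz = np.clip(rewards[:, None] + (gamma ** n_step) * z[None, :], v_min, v_max)
    b = (tz - v_min) / delta_z                              # fractional atom index
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    exact = (lower == upper).astype(float)                  # b landed exactly on an atom
    projected = np.zeros((batch_size, num_atoms))
    rows = np.arange(batch_size)
    for j in range(num_atoms):
        # split the probability mass of atom j between its two neighbouring target atoms
        np.add.at(projected, (rows, lower[:, j]),
                  next_probs[:, j] * (upper[:, j] - b[:, j] + exact[:, j]))
        np.add.at(projected, (rows, upper[:, j]),
                  next_probs[:, j] * (b[:, j] - lower[:, j]))
    return projected
</pre></div>
<p>The projected distribution would then serve as the target of the cross entropy loss described in the next step, and the per-sample KL divergence can be fed back to the prioritized replay buffer as the updated priorities.</p>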
<dl class="class">
<dt id="rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.rainbow_dqn_agent.</code><code class="descname">RainbowDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/rainbow_dqn_agent.html#RainbowDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>n_step</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>n_step</strong> (int)
The number of steps to bootstrap the network over. The first N-1 steps' actual rewards will be accumulated
using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
prediction.</li>
<li><strong>store_transitions_only_when_episodes_are_terminated</strong> (bool)
prediction.</p></li>
<li><p><strong>store_transitions_only_when_episodes_are_terminated</strong> (bool)
If set to True, the transitions will be stored in an Episode object until the episode ends, and only then
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
transitions into the memory, and to do so we need the entire episode first.</li>
transitions into the memory, and to do so we need the entire episode first.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
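<p>As a usage illustration only (the preset-style snippet below assumes Coach's usual naming conventions, e.g. a <cite>RainbowDQNAgentParameters</cite> class exposing an <cite>algorithm</cite> attribute, and is not taken from the generated documentation), these parameters would typically be tuned like this:</p>
<div class="highlight"><pre>
from rl_coach.agents.rainbow_dqn_agent import RainbowDQNAgentParameters

# hypothetical preset snippet: use 3-step returns and store whole episodes
agent_params = RainbowDQNAgentParameters()
agent_params.algorithm.n_step = 3
agent_params.algorithm.store_transitions_only_when_episodes_are_terminated = True
</pre></div>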
</div>
@@ -293,7 +288,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
<a href="qr_dqn.html" class="btn btn-neutral float-right" title="Quantile Regression DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../policy_optimization/ppo.html" class="btn btn-neutral" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../policy_optimization/ppo.html" class="btn btn-neutral float-left" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -302,7 +297,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -319,27 +314,16 @@ transitions into the memory, and to do so we need the entire episode first.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Architectures &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Architectures &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Quantile Regression DQN" href="../agents/value_optimization/qr_dqn.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -194,75 +197,71 @@ parts that are implemented using TensorFlow.</p>
<dl class="class">
<dt id="rl_coach.base_parameters.NetworkParameters">
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">NetworkParameters</code><span class="sig-paren">(</span><em>force_cpu=False</em>, <em>async_training=False</em>, <em>shared_optimizer=True</em>, <em>scale_down_gradients_by_number_of_workers_for_sync_training=True</em>, <em>clip_gradients=None</em>, <em>gradients_clipping_method=&lt;GradientClippingMethod.ClipByGlobalNorm: 0&gt;</em>, <em>l2_regularization=0</em>, <em>learning_rate=0.00025</em>, <em>learning_rate_decay_rate=0</em>, <em>learning_rate_decay_steps=0</em>, <em>input_embedders_parameters={}</em>, <em>embedding_merger_type=&lt;EmbeddingMergerType.Concat: 0&gt;</em>, <em>middleware_parameters=None</em>, <em>heads_parameters=[]</em>, <em>use_separate_networks_per_head=False</em>, <em>optimizer_type='Adam'</em>, <em>optimizer_epsilon=0.0001</em>, <em>adam_optimizer_beta1=0.9</em>, <em>adam_optimizer_beta2=0.99</em>, <em>rms_prop_optimizer_decay=0.9</em>, <em>batch_size=32</em>, <em>replace_mse_with_huber_loss=False</em>, <em>create_target_network=False</em>, <em>tensorflow_support=True</em>, <em>softmax_temperature=1</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/base_parameters.html#NetworkParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.NetworkParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>force_cpu</strong> Force the neural networks to run on the CPU even if a GPU is available</li>
<li><strong>async_training</strong> If set to True, asynchronous training will be used, meaning that each worker will progress at its own
pace, while not waiting for the rest of the workers to calculate their gradients.</li>
<li><strong>shared_optimizer</strong> If set to True, a central optimizer which will be shared with all the workers will be used for applying
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>force_cpu</strong> Force the neural networks to run on the CPU even if a GPU is available</p></li>
<li><p><strong>async_training</strong> If set to True, asynchronous training will be used, meaning that each worker will progress at its own
pace, while not waiting for the rest of the workers to calculate their gradients.</p></li>
<li><p><strong>shared_optimizer</strong> If set to True, a central optimizer which will be shared with all the workers will be used for applying
gradients to the network. Otherwise, each worker will have its own optimizer with its own internal
parameters that will only be affected by the gradients calculated by that worker</li>
<li><strong>scale_down_gradients_by_number_of_workers_for_sync_training</strong> If set to True, in synchronous training, the gradients of each worker will be scaled down by the
parameters that will only be affected by the gradients calculated by that worker</p></li>
<li><p><strong>scale_down_gradients_by_number_of_workers_for_sync_training</strong> If set to True, in synchronous training, the gradients of each worker will be scaled down by the
number of workers. This essentially means that the gradients applied to the network are the average
of the gradients over all the workers.</li>
<li><strong>clip_gradients</strong> A value that will be used for clipping the gradients of the network. If set to None, no gradient clipping
will be applied. Otherwise, the gradients will be clipped according to the gradients_clipping_method.</li>
<li><strong>gradients_clipping_method</strong> A gradient clipping method, defined by a GradientClippingMethod enum, and that will be used to clip the
of the gradients over all the workers.</p></li>
<li><p><strong>clip_gradients</strong> A value that will be used for clipping the gradients of the network. If set to None, no gradient clipping
will be applied. Otherwise, the gradients will be clipped according to the gradients_clipping_method.</p></li>
<li><p><strong>gradients_clipping_method</strong> A gradient clipping method, defined by a GradientClippingMethod enum, and that will be used to clip the
gradients of the network. This will only be used if the clip_gradients value is defined as a value other
than None.</li>
<li><strong>l2_regularization</strong> An L2 regularization weight that will be applied to the network weights while calculating the loss function</li>
<li><strong>learning_rate</strong> The learning rate for the network</li>
<li><strong>learning_rate_decay_rate</strong> If this value is larger than 0, an exponential decay will be applied to the network learning rate.
than None.</p></li>
<li><p><strong>l2_regularization</strong> An L2 regularization weight that will be applied to the network weights while calculating the loss function</p></li>
<li><p><strong>learning_rate</strong> The learning rate for the network</p></li>
<li><p><strong>learning_rate_decay_rate</strong> If this value is larger than 0, an exponential decay will be applied to the network learning rate.
The rate of the decay is defined by this parameter, and the number of training steps the decay will be
applied is defined by learning_rate_decay_steps. Notice that both parameters should be defined in order
for this to work correctly.</li>
<li><strong>learning_rate_decay_steps</strong> If the learning_rate_decay_rate of the network is larger than 0, an exponential decay will be applied to
for this to work correctly.</p></li>
<li><p><strong>learning_rate_decay_steps</strong> If the learning_rate_decay_rate of the network is larger than 0, an exponential decay will be applied to
the network learning rate. The number of steps the decay will be applied is defined by this parameter.
Notice that both this parameter, as well as learning_rate_decay_rate should be defined in order for the
learning rate decay to work correctly.</li>
<li><strong>input_embedders_parameters</strong> A dictionary mapping between input names and input embedders (InputEmbedderParameters) to use for the
learning rate decay to work correctly.</p></li>
<li><p><strong>input_embedders_parameters</strong> A dictionary mapping between input names and input embedders (InputEmbedderParameters) to use for the
network. Each of the keys is an input name as returned from the environment in the state.
For example, if the environment returns a state containing observation and measurements, then
the keys for the input embedders dictionary can be either observation to use the observation as input,
measurements to use the measurements as input, or both.
The embedder type will be automatically selected according to the input type. Vector inputs will
produce a fully connected embedder, and image inputs will produce a convolutional embedder.</li>
<li><strong>embedding_merger_type</strong> The type of embedding merging to use, given by one of the EmbeddingMergerType enum values.
This will be used to merge the outputs of all the input embedders into a single embedding.</li>
<li><strong>middleware_parameters</strong> The parameters of the middleware to use, given by a MiddlewareParameters object.
produce a fully connected embedder, and image inputs will produce a convolutional embedder.</p></li>
<li><p><strong>embedding_merger_type</strong> The type of embedding merging to use, given by one of the EmbeddingMergerType enum values.
This will be used to merge the outputs of all the input embedders into a single embedding.</p></li>
<li><p><strong>middleware_parameters</strong> The parameters of the middleware to use, given by a MiddlewareParameters object.
Each network will have only a single middleware embedder which will take the merged embeddings from the
input embedders and pass them through more neural network layers.</li>
<li><strong>heads_parameters</strong> A list of heads for the network given by their corresponding HeadParameters.
input embedders and pass them through more neural network layers.</p></li>
<li><p><strong>heads_parameters</strong> A list of heads for the network given by their corresponding HeadParameters.
Each network can have one or multiple network heads, where each one will take the output of the middleware
and make some additional computation on top of it. Additionally, each head calculates a weighted loss value,
and the loss values from all the heads will be summed later on.</li>
<li><strong>use_separate_networks_per_head</strong> A flag that allows using different copies of the input embedders and middleware for each one of the heads.
and the loss values from all the heads will be summed later on.</p></li>
<li><p><strong>use_separate_networks_per_head</strong> A flag that allows using different copies of the input embedders and middleware for each one of the heads.
Normally, the heads will have a shared input, but in the case where use_separate_networks_per_head is set
to True, each one of the heads will get a different input.</li>
<li><strong>optimizer_type</strong> A string specifying the optimizer type to use for updating the network. The available optimizers are
Adam, RMSProp and LBFGS.</li>
<li><strong>optimizer_epsilon</strong> An internal optimizer parameter used for Adam and RMSProp.</li>
<li><strong>adam_optimizer_beta1</strong> A beta1 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
optimizer for the network.</li>
<li><strong>adam_optimizer_beta2</strong> A beta2 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
optimizer for the network.</li>
<li><strong>rms_prop_optimizer_decay</strong> The decay value for the RMSProp optimizer, which will be used only in case the RMSProp optimizer was
selected for this network.</li>
<li><strong>batch_size</strong> The batch size to use when updating the network.</li>
<li><strong>replace_mse_with_huber_loss</strong> If set to True, the Huber loss will be used instead of the MSE loss when calculating the network loss</li>
<li><strong>create_target_network</strong> If this flag is set to True, an additional copy of the network will be created and initialized with the
to True, each one of the heads will get a different input.</p></li>
<li><p><strong>optimizer_type</strong> A string specifying the optimizer type to use for updating the network. The available optimizers are
Adam, RMSProp and LBFGS.</p></li>
<li><p><strong>optimizer_epsilon</strong> An internal optimizer parameter used for Adam and RMSProp.</p></li>
<li><p><strong>adam_optimizer_beta1</strong> A beta1 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
optimizer for the network.</p></li>
<li><p><strong>adam_optimizer_beta2</strong> A beta2 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
optimizer for the network.</p></li>
<li><p><strong>rms_prop_optimizer_decay</strong> The decay value for the RMSProp optimizer, which will be used only in case the RMSProp optimizer was
selected for this network.</p></li>
<li><p><strong>batch_size</strong> The batch size to use when updating the network.</p></li>
<li><p><strong>replace_mse_with_huber_loss</strong> If set to True, the Huber loss will be used instead of the MSE loss when calculating the network loss</p></li>
<li><p><strong>create_target_network</strong> If this flag is set to True, an additional copy of the network will be created and initialized with the
same weights as the online network. It can then be queried, and its weights can be synced from the
online network at will.</li>
<li><strong>tensorflow_support</strong> A flag which specifies if the network is supported by the TensorFlow framework.</li>
<li><strong>softmax_temperature</strong> If a softmax is present in the network head output, use this temperature</li>
online network at will.</p></li>
<li><p><strong>tensorflow_support</strong> A flag which specifies if the network is supported by the TensorFlow framework.</p></li>
<li><p><strong>softmax_temperature</strong> If a softmax is present in the network head output, use this temperature</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
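<p>The sketch below is an assumption for illustration; it only touches fields listed in the signature above and is not part of the generated documentation. It shows how a few of these fields might be adjusted after construction:</p>
<div class="highlight"><pre>
from rl_coach.base_parameters import NetworkParameters

# illustrative only: override a handful of the documented defaults
net_params = NetworkParameters()
net_params.learning_rate = 0.0001
net_params.batch_size = 64
net_params.clip_gradients = 40.0              # used together with gradients_clipping_method
net_params.replace_mse_with_huber_loss = True
net_params.create_target_network = True       # keep a synced copy for target updates
</pre></div>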
<div class="section" id="architecture">
@@ -271,19 +270,15 @@ online network at will.</li>
<dt id="rl_coach.architectures.architecture.Architecture">
<em class="property">class </em><code class="descclassname">rl_coach.architectures.architecture.</code><code class="descname">Architecture</code><span class="sig-paren">(</span><em>agent_parameters: rl_coach.base_parameters.AgentParameters</em>, <em>spaces: rl_coach.spaces.SpacesDefinition</em>, <em>name: str = ''</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture" title="Permalink to this definition"></a></dt>
<dd><p>Creates a neural network architecture, that can be trained and used for inference.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>agent_parameters</strong> the agent parameters</li>
<li><strong>spaces</strong> the spaces (observation, action, etc.) definition of the agent</li>
<li><strong>name</strong> the name of the network</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>agent_parameters</strong> the agent parameters</p></li>
<li><p><strong>spaces</strong> the spaces (observation, action, etc.) definition of the agent</p></li>
<li><p><strong>name</strong> the name of the network</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
<dl class="method">
<dt id="rl_coach.architectures.architecture.Architecture.accumulate_gradients">
<code class="descname">accumulate_gradients</code><span class="sig-paren">(</span><em>inputs: Dict[str, numpy.ndarray], targets: List[numpy.ndarray], additional_fetches: list = None, importance_weights: numpy.ndarray = None, no_accumulation: bool = False</em><span class="sig-paren">)</span> &#x2192; Tuple[float, List[float], float, list]<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.accumulate_gradients"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.accumulate_gradients" title="Permalink to this definition"></a></dt>
@@ -292,29 +287,28 @@ gradients for model parameters. Will run forward and backward pass to compute gr
values if required and then accumulate gradients from all learners. It does not update the model weights,
that's performed in the <cite>apply_and_reset_gradients</cite> method.</p>
<p>Once gradients are accumulated, they are accessed via the <cite>accumulated_gradients</cite> property of this class.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>inputs</strong> <p>typically the environment states (but can also contain other data for loss)
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>inputs</strong> <p>typically the environment states (but can also contain other data for loss)
(e.g. <cite>{observation: numpy.ndarray}</cite> with <cite>observation</cite> of shape (batch_size, observation_space_size) or</p>
<blockquote>
<div>(batch_size, observation_space_size, stack_size) or</div></blockquote>
<div><p>(batch_size, observation_space_size, stack_size) or</p>
</div></blockquote>
<p><cite>{observation: numpy.ndarray, output_0_0: numpy.ndarray}</cite> with <cite>output_0_0</cite> of shape (batch_size,))</p>
</li>
<li><strong>targets</strong> targets for calculating loss. For example discounted rewards for value network
</p></li>
<li><p><strong>targets</strong> targets for calculating loss. For example discounted rewards for value network
for calculating the value-network loss would be a target. Length of list and order of arrays in
the list matches that of network losses which are defined by network parameters</li>
<li><strong>additional_fetches</strong> list of additional values to fetch and return. The type of each list
element is framework dependent.</li>
<li><strong>importance_weights</strong> ndarray of shape (batch_size,) to multiply with batch loss.</li>
<li><strong>no_accumulation</strong> if True, set gradient values to the new gradients, otherwise sum with previously
calculated gradients</li>
the list matches that of network losses which are defined by network parameters</p></li>
<li><p><strong>additional_fetches</strong> list of additional values to fetch and return. The type of each list
element is framework dependent.</p></li>
<li><p><strong>importance_weights</strong> ndarray of shape (batch_size,) to multiply with batch loss.</p></li>
<li><p><strong>no_accumulation</strong> if True, set gradient values to the new gradients, otherwise sum with previously
calculated gradients</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><p>tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><p>tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
total_loss (float): sum of all head losses
losses (list of float): list of all losses. The order is list of target losses followed by list of</p>
<blockquote>
@@ -324,10 +318,8 @@ losses (list of float): list of all losses. The order is list of target losses f
<p>norm_unclipped_grads (float): global norm of all gradients before any gradient clipping is applied
fetched_tensors: all values for additional_fetches</p>
</p>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
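<p>For illustration, a minimal sketch of the accumulate-then-apply flow documented here (the <cite>network</cite>, <cite>states</cite>, <cite>targets</cite> and <cite>weights</cite> objects are assumed placeholders):</p>
<div class="highlight"><pre>
# `network` is assumed to be an instance of a concrete Architecture subclass
total_loss, losses, grad_norm, fetched = network.accumulate_gradients(
    inputs={'observation': states},      # e.g. shape (batch_size, observation_space_size)
    targets=[targets],                   # one target array per network loss
    importance_weights=weights)          # optional, shape (batch_size,)

# gradients now sit in `network.accumulated_gradients`; apply them and reset
network.apply_and_reset_gradients(network.accumulated_gradients)
</pre></div>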
<dl class="method">
@@ -335,19 +327,15 @@ fetched_tensors: all values for additional_fetches</p>
<code class="descname">apply_and_reset_gradients</code><span class="sig-paren">(</span><em>gradients: List[numpy.ndarray], scaler: float = 1.0</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.apply_and_reset_gradients"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.apply_and_reset_gradients" title="Permalink to this definition"></a></dt>
<dd><p>Applies the given gradients to the network weights and resets the gradient accumulations.
Has the same impact as calling <cite>apply_gradients</cite>, then <cite>reset_accumulated_gradients</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>gradients</strong> gradients for the parameter weights, taken from <cite>accumulated_gradients</cite> property
of an identical network (either self or another identical network)</li>
<li><strong>scaler</strong> A scaling factor that allows rescaling the gradients before applying them</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>gradients</strong> gradients for the parameter weights, taken from <cite>accumulated_gradients</cite> property
of an identical network (either self or another identical network)</p></li>
<li><p><strong>scaler</strong> A scaling factor that allows rescaling the gradients before applying them</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -355,19 +343,15 @@ of an identical network (either self or another identical network)</li>
<code class="descname">apply_gradients</code><span class="sig-paren">(</span><em>gradients: List[numpy.ndarray], scaler: float = 1.0</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.apply_gradients"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.apply_gradients" title="Permalink to this definition"></a></dt>
<dd><p>Applies the given gradients to the network weights.
Will be performed synchronously or asynchronously depending on <cite>network_parameters.async_training</cite></p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>gradients</strong> gradients for the parameter weights, taken from <cite>accumulated_gradients</cite> property
of an identical network (either self or another identical network)</li>
<li><strong>scaler</strong> A scaling factor that allows rescaling the gradients before applying them</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>gradients</strong> gradients for the parameter weights, taken from <cite>accumulated_gradients</cite> property
of an identical network (either self or another identical network)</p></li>
<li><p><strong>scaler</strong> A scaling factor that allows rescaling the gradients before applying them</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -376,15 +360,13 @@ of an identical network (either self or another identical network)</li>
<dd><p>Collection of all savers for the network (typically only one saver for network and one for ONNX export)
:param parent_path_suffix: path suffix of the parent of the network</p>
<blockquote>
<div>(e.g. could be name of level manager plus name of agent)</div></blockquote>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">saver collection for the network</td>
</tr>
</tbody>
</table>
<div><p>(e.g. could be name of level manager plus name of agent)</p>
</div></blockquote>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>saver collection for the network</p>
</dd>
</dl>
</dd></dl>
<dl class="staticmethod">
@@ -404,75 +386,62 @@ of an identical network (either self or another identical network)</li>
<dd><p>Gets the value of a specified variable. The type of the variable is dependent on the framework.
Example of a variable is head.kl_coefficient, which could be a symbol for evaluation
or could be a string representing the value.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>variable</strong> variable of interest</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">value of the specified variable</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>variable</strong> variable of interest</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>value of the specified variable</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.architecture.Architecture.get_weights">
<code class="descname">get_weights</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; List[numpy.ndarray]<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.get_weights"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.get_weights" title="Permalink to this definition"></a></dt>
<dd><p>Gets model weights as a list of ndarrays. It is used for synchronizing weights between two identical networks.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">list weights as ndarray</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>list weights as ndarray</p>
</dd>
</dl>
</dd></dl>
<dl class="staticmethod">
<dt id="rl_coach.architectures.architecture.Architecture.parallel_predict">
<em class="property">static </em><code class="descname">parallel_predict</code><span class="sig-paren">(</span><em>sess: Any, network_input_tuples: List[Tuple[Architecture, Dict[str, numpy.ndarray]]]</em><span class="sig-paren">)</span> &#x2192; Tuple[numpy.ndarray, ...]<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.parallel_predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.parallel_predict" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>sess</strong> active session to use for prediction</li>
<li><strong>network_input_tuples</strong> tuple of network and corresponding input</li>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sess</strong> active session to use for prediction</p></li>
<li><p><strong>network_input_tuples</strong> tuple of network and corresponding input</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">list or tuple of outputs from all networks</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>list or tuple of outputs from all networks</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.architecture.Architecture.predict">
<code class="descname">predict</code><span class="sig-paren">(</span><em>inputs: Dict[str, numpy.ndarray], outputs: List[Any] = None, squeeze_output: bool = True, initial_feed_dict: Dict[Any, numpy.ndarray] = None</em><span class="sig-paren">)</span> &#x2192; Tuple[numpy.ndarray, ...]<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.predict" title="Permalink to this definition"></a></dt>
<dd><p>Given input observations, use the model to make predictions (e.g. action or value).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>inputs</strong> current state (i.e. observations, measurements, goals, etc.)
(e.g. <cite>{observation: numpy.ndarray}</cite> of shape (batch_size, observation_space_size))</li>
<li><strong>outputs</strong> list of outputs to return. Return all outputs if unspecified. Type of the list elements
depends on the framework backend.</li>
<li><strong>squeeze_output</strong> call squeeze_list on output before returning if True</li>
<li><strong>initial_feed_dict</strong> a dictionary of extra inputs for forward pass.</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>inputs</strong> current state (i.e. observations, measurements, goals, etc.)
(e.g. <cite>{observation: numpy.ndarray}</cite> of shape (batch_size, observation_space_size))</p></li>
<li><p><strong>outputs</strong> list of outputs to return. Return all outputs if unspecified. Type of the list elements
depends on the framework backend.</p></li>
<li><p><strong>squeeze_output</strong> call squeeze_list on output before returning if True</p></li>
<li><p><strong>initial_feed_dict</strong> a dictionary of extra inputs for forward pass.</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">predictions of action or value of shape (batch_size, action_space_size) for action predictions)</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>predictions of action or value of shape (batch_size, action_space_size) for action predictions)</p>
</dd>
</dl>
</dd></dl>
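<p>A minimal usage sketch of <cite>predict</cite> (assuming a concrete <cite>network</cite> instance and a NumPy <cite>observation</cite> batch; not part of the generated documentation):</p>
<div class="highlight"><pre>
# `observation` is assumed to have shape (batch_size, observation_space_size);
# with the default squeeze_output=True the (squeezed) network outputs are returned
action_values = network.predict(inputs={'observation': observation})
</pre></div>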
<dl class="method">
@@ -490,41 +459,33 @@ which must return a list of numpy ndarrays. Child class must ensure that <cite>a
and is a unique identifier for assigning value to a variable. For example an agent may use
head.assign_kl_coefficient. There is a one to one mapping between assign_op and placeholder
(in the example above, placeholder would be head.kl_coefficient_ph).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>assign_op</strong> a parameter representing the operation for assigning value to a specific variable</li>
<li><strong>value</strong> value of the specified variable used for update</li>
<li><strong>placeholder</strong> a placeholder for binding the value to assign_op.</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>assign_op</strong> a parameter representing the operation for assigning value to a specific variable</p></li>
<li><p><strong>value</strong> value of the specified variable used for update</p></li>
<li><p><strong>placeholder</strong> a placeholder for binding the value to assign_op.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.architecture.Architecture.set_weights">
<code class="descname">set_weights</code><span class="sig-paren">(</span><em>weights: List[numpy.ndarray], rate: float = 1.0</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.set_weights"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.set_weights" title="Permalink to this definition"></a></dt>
<dd><p>Sets model weights for provided layer parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>weights</strong> list of model weights in the same order as received in get_weights</li>
<li><strong>rate</strong> controls the mixture of given weight values versus old weight values.
i.e. new_weight = rate * given_weight + (1 - rate) * old_weight</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>weights</strong> list of model weights in the same order as received in get_weights</p></li>
<li><p><strong>rate</strong> controls the mixture of given weight values versus old weight values.
i.e. new_weight = rate * given_weight + (1 - rate) * old_weight</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">None</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
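<p>For example, the documented mixing rule can be used for a soft update between two identical networks (the <cite>online</cite> and <cite>target</cite> instances below are assumed placeholders):</p>
<div class="highlight"><pre>
# new_weight = 0.001 * online_weight + 0.999 * old_target_weight
target.set_weights(online.get_weights(), rate=0.001)
</pre></div>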
<dl class="method">
@@ -535,26 +496,24 @@ forward pass and backward pass of the network, accumulates the gradients and app
update the weights.
Calls <cite>accumulate_gradients</cite> followed by <cite>apply_and_reset_gradients</cite>.
Note: Currently an unused method.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>inputs</strong> typically the environment states (but can also contain other data necessary for loss).
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>inputs</strong> typically the environment states (but can also contain other data necessary for loss).
(e.g. <cite>{observation: numpy.ndarray}</cite> with <cite>observation</cite> of shape (batch_size, observation_space_size) or
(batch_size, observation_space_size, stack_size) or
<cite>{observation: numpy.ndarray, output_0_0: numpy.ndarray}</cite> with <cite>output_0_0</cite> of shape (batch_size,))</li>
<li><strong>targets</strong> target values of shape (batch_size, ). For example discounted rewards for value network
<cite>{observation: numpy.ndarray, output_0_0: numpy.ndarray}</cite> with <cite>output_0_0</cite> of shape (batch_size,))</p></li>
<li><p><strong>targets</strong> target values of shape (batch_size, ). For example discounted rewards for value network
for calculating the value-network loss would be a target. Length of list and order of arrays in
the list matches that of network losses which are defined by network parameters</li>
<li><strong>scaler</strong> value to scale gradients by before optimizing network weights</li>
<li><strong>additional_fetches</strong> list of additional values to fetch and return. The type of each list
element is framework dependent.</li>
<li><strong>importance_weights</strong> ndarray of shape (batch_size,) to multiply with batch loss.</li>
the list matches that of network losses which are defined by network parameters</p></li>
<li><p><strong>scaler</strong> value to scale gradients by before optimizing network weights</p></li>
<li><p><strong>additional_fetches</strong> list of additional values to fetch and return. The type of each list
element is framework dependent.</p></li>
<li><p><strong>importance_weights</strong> ndarray of shape (batch_size,) to multiply with batch loss.</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><p>tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><p>tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
total_loss (float): sum of all head losses
losses (list of float): list of all losses. The order is list of target losses followed by list</p>
<blockquote>
@@ -564,10 +523,8 @@ losses (list of float): list of all losses. The order is list of target losses f
<p>norm_unclipped_grads (float): global norm of all gradients before any gradient clipping is applied
fetched_tensors: all values for additional_fetches</p>
</p>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</dd></dl>
@@ -590,47 +547,39 @@ between them.</p>
<code class="descname">apply_gradients_and_sync_networks</code><span class="sig-paren">(</span><em>reset_gradients=True</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.apply_gradients_and_sync_networks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_and_sync_networks" title="Permalink to this definition"></a></dt>
<dd><p>Applies the gradients accumulated in the online network to the global network or to itself and syncs the
networks if necessary</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>reset_gradients</strong> If set to True, the accumulated gradients wont be reset to 0 after applying them to
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>reset_gradients</strong> If set to True, the accumulated gradients wont be reset to 0 after applying them to
the network. this is useful when the accumulated gradients are overwritten instead
if accumulated by the accumulate_gradients function. this allows reducing time
complexity for this function by around 10%</td>
</tr>
</tbody>
</table>
complexity for this function by around 10%</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_to_global_network">
<code class="descname">apply_gradients_to_global_network</code><span class="sig-paren">(</span><em>gradients=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.apply_gradients_to_global_network"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_to_global_network" title="Permalink to this definition"></a></dt>
<dd><p>Apply gradients from the online network on the global network</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>gradients</strong> optional gradients that will be used instead of teh accumulated gradients</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"></td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>gradients</strong> optional gradients that will be used instead of teh accumulated gradients</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p></p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_to_online_network">
<code class="descname">apply_gradients_to_online_network</code><span class="sig-paren">(</span><em>gradients=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.apply_gradients_to_online_network"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_to_online_network" title="Permalink to this definition"></a></dt>
<dd><p>Apply gradients from the online network on itself</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"></td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p></p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -639,124 +588,106 @@ complexity for this function by around 10%</td>
<dd><p>Collect all of the network's savers for the global or online network
Note: global, online, and target network are all copies of the same network whose parameters are</p>
<blockquote>
<div>updated at different rates. So we only need to save one of the networks; the one that holds the most
<div><p>updated at different rates. So we only need to save one of the networks; the one that holds the most
recent parameters. target network is created for some agents and used for stabilizing training by
updating parameters from online network at a slower rate. As a result, target network never contains
the most recent set of parameters. In single-worker training, no global network is created and online
network contains the most recent parameters. In vertical distributed training with more than one worker,
global network is updated by all workers and contains the most recent parameters.
Therefore preference is given to global network if it exists, otherwise online network is used
for saving.</div></blockquote>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>parent_path_suffix</strong> path suffix of the parent of the network wrapper
(e.g. could be name of level manager plus name of agent)</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">collection of all checkpoint objects</td>
</tr>
</tbody>
</table>
for saving.</p>
</div></blockquote>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>parent_path_suffix</strong> path suffix of the parent of the network wrapper
(e.g. could be name of level manager plus name of agent)</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>collection of all checkpoint objects</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.parallel_prediction">
<code class="descname">parallel_prediction</code><span class="sig-paren">(</span><em>network_input_tuples: List[Tuple]</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.parallel_prediction"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.parallel_prediction" title="Permalink to this definition"></a></dt>
<dd><p>Run several network predictions in parallel. Currently this only supports running each of the networks once.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>network_input_tuples</strong> a list of tuples where the first element is the network (online_network,
target_network or global_network) and the second element is the inputs</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the outputs of all the networks in the same order as the inputs were given</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>network_input_tuples</strong> a list of tuples where the first element is the network (online_network,
target_network or global_network) and the second element is the inputs</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the outputs of all the networks in the same order as the inputs were given</p>
</dd>
</dl>
</dd></dl>
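<p>A usage sketch under the assumption that <cite>net</cite> is a NetworkWrapper and <cite>batch</cite> is a dictionary of NumPy inputs (illustration only):</p>
<div class="highlight"><pre>
# run the online and target copies on the same inputs in a single call;
# outputs come back in the same order as the input tuples were given
outputs = net.parallel_prediction([
    (net.online_network, batch),
    (net.target_network, batch),
])
</pre></div>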
<dl class="method">
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.set_is_training">
<code class="descname">set_is_training</code><span class="sig-paren">(</span><em>state: bool</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.set_is_training"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.set_is_training" title="Permalink to this definition"></a></dt>
<dd><p>Set the phase of the network between training and testing</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state</strong> The current state (True = Training, False = Testing)</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>state</strong> The current state (True = Training, False = Testing)</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.sync">
<code class="descname">sync</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.sync"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.sync" title="Permalink to this definition"></a></dt>
<dd><p>Initializes the weights of the networks to match each other</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"></td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p></p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.train_and_sync_networks">
<code class="descname">train_and_sync_networks</code><span class="sig-paren">(</span><em>inputs</em>, <em>targets</em>, <em>additional_fetches=[]</em>, <em>importance_weights=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.train_and_sync_networks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.train_and_sync_networks" title="Permalink to this definition"></a></dt>
<dd><p>A generic training function that enables multi-threaded training using a global network if necessary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>inputs</strong> The inputs for the network.</li>
<li><strong>targets</strong> The targets corresponding to the given inputs</li>
<li><strong>additional_fetches</strong> Any additional tensor the user wants to fetch</li>
<li><strong>importance_weights</strong> A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the samples' losses won't be scaled</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>inputs</strong> The inputs for the network.</p></li>
<li><p><strong>targets</strong> The targets corresponding to the given inputs</p></li>
<li><p><strong>additional_fetches</strong> Any additional tensor the user wants to fetch</p></li>
<li><p><strong>importance_weights</strong> A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the samples' losses won't be scaled</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">The loss of the training iteration</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The loss of the training iteration</p>
</dd>
</dl>
</dd></dl>
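<p>A sketch of one training iteration built from the calls documented on this page (the <cite>net</cite>, <cite>states</cite> and <cite>td_targets</cite> names are assumed placeholders):</p>
<div class="highlight"><pre>
# train the online network (and the global one, if it exists), then sync
loss = net.train_and_sync_networks(
    inputs={'observation': states},
    targets=[td_targets])

# every few thousand steps, copy the online weights into the target network
net.update_target_network(rate=1.0)
</pre></div>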
<dl class="method">
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.update_online_network">
<code class="descname">update_online_network</code><span class="sig-paren">(</span><em>rate=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.update_online_network"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.update_online_network" title="Permalink to this definition"></a></dt>
<dd><p>Copy weights: global network &gt;&gt;&gt; online network</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rate</strong> the rate of copying the weights - 1 for copying exactly</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>rate</strong> the rate of copying the weights - 1 for copying exactly</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.update_target_network">
<code class="descname">update_target_network</code><span class="sig-paren">(</span><em>rate=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.update_target_network"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.update_target_network" title="Permalink to this definition"></a></dt>
<dd><p>Copy weights: online network &gt;&gt;&gt; target network</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rate</strong> the rate of copying the weights - 1 for copying exactly</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>rate</strong> the rate of copying the weights - 1 for copying exactly</p>
</dd>
</dl>
</dd></dl>
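<p>The <code>rate</code> argument controls how much of the online weights is mixed into the target network. A sketch (again assuming <code>network</code> is an existing NetworkWrapper):</p>
<pre>
# Hypothetical sketch: "network" is assumed to be an existing NetworkWrapper.
network.update_target_network(rate=1.0)     # hard copy of the online weights into the target network
network.update_target_network(rate=0.001)   # soft (Polyak-style) update mixing in 0.1% of the online weights
</pre>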
</dd></dl>
@@ -775,7 +706,7 @@ error of this sample. If it is not given, the samples losses wont be scaled</
<a href="../data_stores/index.html" class="btn btn-neutral float-right" title="Data Stores" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../agents/value_optimization/qr_dqn.html" class="btn btn-neutral" title="Quantile Regression DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../agents/value_optimization/qr_dqn.html" class="btn btn-neutral float-left" title="Quantile Regression DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -784,7 +715,7 @@ error of this sample. If it is not given, the samples losses wont be scaled</
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -801,27 +732,16 @@ error of this sample. If it is not given, the samples losses wont be scaled</
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Core Types &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Core Types &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script type="text/javascript" src="../_static/jquery.js"></script>
<script type="text/javascript" src="../_static/underscore.js"></script>
<script type="text/javascript" src="../_static/doctools.js"></script>
<script type="text/javascript" src="../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Orchestrators" href="orchestrators/index.html" />
<link href="../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -196,23 +199,19 @@
<dt id="rl_coach.core_types.ActionInfo">
<em class="property">class </em><code class="descclassname">rl_coach.core_types.</code><code class="descname">ActionInfo</code><span class="sig-paren">(</span><em>action: Union[int, float, numpy.ndarray, List], all_action_probabilities: float = 0, action_value: float = 0.0, state_value: float = 0.0, max_action_value: float = None</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/core_types.html#ActionInfo"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.ActionInfo" title="Permalink to this definition"></a></dt>
<dd><p>Action info is a class that holds an action and various additional information details about it</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>action</strong> the action</li>
<li><strong>all_action_probabilities</strong> the probability that the action was given when selecting it</li>
<li><strong>action_value</strong> the state-action value (Q value) of the action</li>
<li><strong>state_value</strong> the state value (V value) of the state where the action was taken</li>
<li><strong>max_action_value</strong> in case this is an action that was selected randomly, this is the value of the action
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>action</strong> the action</p></li>
<li><p><strong>all_action_probabilities</strong> the probability that the action was given when selecting it</p></li>
<li><p><strong>action_value</strong> the state-action value (Q value) of the action</p></li>
<li><p><strong>state_value</strong> the state value (V value) of the state where the action was taken</p></li>
<li><p><strong>max_action_value</strong> in case this is an action that was selected randomly, this is the value of the action
that received the maximum value. if no value is given, the action is assumed to be the
action with the maximum value</li>
action with the maximum value</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
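<p>For example, an ActionInfo instance can be built directly from the signature above (illustrative values only):</p>
<pre>
from rl_coach.core_types import ActionInfo

info = ActionInfo(action=2, all_action_probabilities=0.25,
                  action_value=1.7, state_value=1.2)
</pre>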
</div>
@@ -224,44 +223,37 @@ action with the maximum value</li>
<dd><p>A wrapper around a list of transitions that helps extracting batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transitions</strong> a list of transitions to extract the batch from</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>transitions</strong> a list of transitions to extract the batch from</p>
</dd>
</dl>
<dl class="method">
<dt id="rl_coach.core_types.Batch.actions">
<code class="descname">actions</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.actions"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.actions" title="Permalink to this definition"></a></dt>
<dd><p>if the actions were not converted to a batch before, extract them to a batch and then return the batch</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> add an extra dimension to the actions batch</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the actions of the batch</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>expand_dims</strong> add an extra dimension to the actions batch</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a numpy array containing all the actions of the batch</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.core_types.Batch.game_overs">
<code class="descname">game_overs</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.game_overs"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.game_overs" title="Permalink to this definition"></a></dt>
<dd><p>if the game_overs were not converted to a batch before, extract them to a batch and then return the batch</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> add an extra dimension to the game_overs batch</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the game over flags of the batch</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>expand_dims</strong> add an extra dimension to the game_overs batch</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a numpy array containing all the game over flags of the batch</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -269,16 +261,14 @@ The class uses lazy evaluation in order to return each of the available paramete
<code class="descname">goals</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.goals"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.goals" title="Permalink to this definition"></a></dt>
<dd><p>if the goals were not converted to a batch before, extract them to a batch and then return the batch
if the goal was not filled, this will raise an exception</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> add an extra dimension to the goals batch</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the goals of the batch</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>expand_dims</strong> add an extra dimension to the goals batch</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a numpy array containing all the goals of the batch</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -286,16 +276,14 @@ if the goal was not filled, this will raise an exception</p>
<code class="descname">info</code><span class="sig-paren">(</span><em>key</em>, <em>expand_dims=False</em><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.info"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.info" title="Permalink to this definition"></a></dt>
<dd><p>if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the
batch. if the key is not part of the keys in the info dictionary, this will raise an exception</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> add an extra dimension to the info batch</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the info values of the batch corresponding to the given key</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>expand_dims</strong> add an extra dimension to the info batch</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a numpy array containing all the info values of the batch corresponding to the given key</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -309,9 +297,9 @@ batch. if the key is not part of the keys in the info dictionary, this will rais
<dl class="method">
<dt id="rl_coach.core_types.Batch.n_step_discounted_rewards">
<code class="descname">n_step_discounted_rewards</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.n_step_discounted_rewards"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.n_step_discounted_rewards" title="Permalink to this definition"></a></dt>
<dd><dl class="docutils">
<dt>if the n_step_discounted_rewards were not converted to a batch before, extract them to a batch and then return</dt>
<dd>the batch</dd>
<dd><dl class="simple">
<dt>if the n_step_discounted_rewards were not converted to a batch before, extract them to a batch and then return</dt><dd><p>the batch</p>
</dd>
</dl>
<p>if the n step discounted rewards were not filled, this will raise an exception
:param expand_dims: add an extra dimension to the total_returns batch
@@ -323,85 +311,69 @@ batch. if the key is not part of the keys in the info dictionary, this will rais
<code class="descname">next_states</code><span class="sig-paren">(</span><em>fetches: List[str], expand_dims=False</em><span class="sig-paren">)</span> &#x2192; Dict[str, numpy.ndarray]<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.next_states"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.next_states" title="Permalink to this definition"></a></dt>
<dd><p>follow the keys in fetches to extract the corresponding items from the next states in the batch
if these keys were not already extracted before. return only the values corresponding to those keys</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>fetches</strong> the keys of the state dictionary to extract</li>
<li><strong>expand_dims</strong> add an extra dimension to each of the value batches</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>fetches</strong> the keys of the state dictionary to extract</p></li>
<li><p><strong>expand_dims</strong> add an extra dimension to each of the value batches</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">a dictionary containing a batch of values correponding to each of the given fetches keys</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a dictionary containing a batch of values correponding to each of the given fetches keys</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.core_types.Batch.rewards">
<code class="descname">rewards</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.rewards"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.rewards" title="Permalink to this definition"></a></dt>
<dd><p>if the rewards were not converted to a batch before, extract them to a batch and then return the batch</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> add an extra dimension to the rewards batch</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the rewards of the batch</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>expand_dims</strong> add an extra dimension to the rewards batch</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a numpy array containing all the rewards of the batch</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.core_types.Batch.shuffle">
<code class="descname">shuffle</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.shuffle"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.shuffle" title="Permalink to this definition"></a></dt>
<dd><p>Shuffle all the transitions in the batch</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
<dt id="rl_coach.core_types.Batch.size">
<code class="descname">size</code><a class="headerlink" href="#rl_coach.core_types.Batch.size" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the size of the batch</td>
</tr>
</tbody>
</table>
<dd><dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>the size of the batch</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.core_types.Batch.slice">
<code class="descname">slice</code><span class="sig-paren">(</span><em>start</em>, <em>end</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.slice"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.slice" title="Permalink to this definition"></a></dt>
<dd><p>Keep a slice from the batch and discard the rest of the batch</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>start</strong> the start index in the slice</li>
<li><strong>end</strong> the end index in the slice</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>start</strong> the start index in the slice</p></li>
<li><p><strong>end</strong> the end index in the slice</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">None</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -409,21 +381,17 @@ if these keys were not already extracted before. return only the values correspo
<code class="descname">states</code><span class="sig-paren">(</span><em>fetches: List[str], expand_dims=False</em><span class="sig-paren">)</span> &#x2192; Dict[str, numpy.ndarray]<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.states"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.states" title="Permalink to this definition"></a></dt>
<dd><p>follow the keys in fetches to extract the corresponding items from the states in the batch
if these keys were not already extracted before. return only the values corresponding to those keys</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>fetches</strong> the keys of the state dictionary to extract</li>
<li><strong>expand_dims</strong> add an extra dimension to each of the value batches</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>fetches</strong> the keys of the state dictionary to extract</p></li>
<li><p><strong>expand_dims</strong> add an extra dimension to each of the value batches</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">a dictionary containing a batch of values correponding to each of the given fetches keys</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a dictionary containing a batch of values correponding to each of the given fetches keys</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
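<p>A short usage sketch for the Batch API above (hedged: <code>transitions</code> is assumed to be a list of Transition objects collected elsewhere, and <code>'observation'</code> is only an example of a state dictionary key):</p>
<pre>
from rl_coach.core_types import Batch

# "transitions" is assumed to be a list of rl_coach.core_types.Transition
# objects collected elsewhere; it is not built in this sketch.
batch = Batch(transitions)
batch.shuffle()
states  = batch.states(['observation'])     # dict with one numpy array per fetch key
actions = batch.actions()
rewards = batch.rewards(expand_dims=True)
print(batch.size)
</pre>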
@@ -436,23 +404,19 @@ if these keys were not already extracted before. return only the values correspo
<em class="property">class </em><code class="descclassname">rl_coach.core_types.</code><code class="descname">EnvResponse</code><span class="sig-paren">(</span><em>next_state: Dict[str, numpy.ndarray], reward: Union[int, float, numpy.ndarray], game_over: bool, info: Dict = None, goal: numpy.ndarray = None</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/core_types.html#EnvResponse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.EnvResponse" title="Permalink to this definition"></a></dt>
<dd><p>An env response is a collection containing the information returning from the environment after a single action
has been performed on it.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>next_state</strong> The new state that the environment has transitioned into. Assumed to be a dictionary where the
observation is located at state['observation']</li>
<li><strong>reward</strong> The reward received from the environment</li>
<li><strong>game_over</strong> A boolean which should be True if the episode terminated after
the execution of the action.</li>
<li><strong>info</strong> any additional info from the environment</li>
<li><strong>goal</strong> a goal defined by the environment</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>next_state</strong> The new state that the environment has transitioned into. Assumed to be a dictionary where the
observation is located at state['observation']</p></li>
<li><p><strong>reward</strong> The reward received from the environment</p></li>
<li><p><strong>game_over</strong> A boolean which should be True if the episode terminated after
the execution of the action.</p></li>
<li><p><strong>info</strong> any additional info from the environment</p></li>
<li><p><strong>goal</strong> a goal defined by the environment</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
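<p>For example (illustrative values only):</p>
<pre>
import numpy as np
from rl_coach.core_types import EnvResponse

response = EnvResponse(next_state={'observation': np.zeros(4)},
                       reward=1.0, game_over=False)
</pre>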
</div>
@@ -462,62 +426,50 @@ the execution of the action.</li>
<dt id="rl_coach.core_types.Episode">
<em class="property">class </em><code class="descclassname">rl_coach.core_types.</code><code class="descname">Episode</code><span class="sig-paren">(</span><em>discount: float = 0.99</em>, <em>bootstrap_total_return_from_old_policy: bool = False</em>, <em>n_step: int = -1</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode" title="Permalink to this definition"></a></dt>
<dd><p>An Episode represents a set of sequential transitions, that end with a terminal state.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>discount</strong> the discount factor to use when calculating total returns</li>
<li><strong>bootstrap_total_return_from_old_policy</strong> should the total return be bootstrapped from the values in the
memory</li>
<li><strong>n_step</strong> the number of future steps to sum the reward over before bootstrapping</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>discount</strong> the discount factor to use when calculating total returns</p></li>
<li><p><strong>bootstrap_total_return_from_old_policy</strong> should the total return be bootstrapped from the values in the
memory</p></li>
<li><p><strong>n_step</strong> the number of future steps to sum the reward over before bootstrapping</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
<dl class="method">
<dt id="rl_coach.core_types.Episode.get_first_transition">
<code class="descname">get_first_transition</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.Transition<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.get_first_transition"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.get_first_transition" title="Permalink to this definition"></a></dt>
<dd><p>Get the first transition in the episode, or None if there are no transitions available</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The first transition in the episode</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The first transition in the episode</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.core_types.Episode.get_last_transition">
<code class="descname">get_last_transition</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.Transition<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.get_last_transition"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.get_last_transition" title="Permalink to this definition"></a></dt>
<dd><p>Get the last transition in the episode, or None if there are no transitions available</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The last transition in the episode</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The last transition in the episode</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.core_types.Episode.get_transition">
<code class="descname">get_transition</code><span class="sig-paren">(</span><em>transition_idx: int</em><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.Transition<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.get_transition"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.get_transition" title="Permalink to this definition"></a></dt>
<dd><p>Get a specific transition by its index.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transition_idx</strong> The index of the transition to get</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">The transition which is stored in the given index</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>transition_idx</strong> The index of the transition to get</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The transition which is stored in the given index</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -526,16 +478,14 @@ memory</li>
<dd><p>Get the values for some transition attribute from all the transitions in the episode.
For example, this allows getting the rewards for all the transitions as a list by calling
get_transitions_attribute('reward')</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>attribute_name</strong> The name of the attribute to extract from all the transitions</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A list of values from all the transitions according to the attribute given in attribute_name</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>attribute_name</strong> The name of the attribute to extract from all the transitions</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A list of values from all the transitions according to the attribute given in attribute_name</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -543,44 +493,36 @@ get_transitions_attribute(reward)</p>
<code class="descname">insert</code><span class="sig-paren">(</span><em>transition: rl_coach.core_types.Transition</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.insert"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.insert" title="Permalink to this definition"></a></dt>
<dd><p>Insert a new transition to the episode. If the game_over flag in the transition is set to True,
the episode will be marked as complete.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transition</strong> The new transition to insert to the episode</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>transition</strong> The new transition to insert to the episode</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.core_types.Episode.is_empty">
<code class="descname">is_empty</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; bool<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.is_empty"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.is_empty" title="Permalink to this definition"></a></dt>
<dd><p>Check if the episode is empty</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A boolean value determining if the episode is empty or not</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>A boolean value determining if the episode is empty or not</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.core_types.Episode.length">
<code class="descname">length</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; int<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.length"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.length" title="Permalink to this definition"></a></dt>
<dd><p>Return the length of the episode, which is the number of transitions it holds.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The number of transitions in the episode</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The number of transitions in the episode</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -590,14 +532,11 @@ the episode will be marked as complete.</p>
The returns will be calculated according to the rewards of each transition, together with the number of steps
to bootstrap from and the discount factor, as defined by n_step and discount respectively when initializing
the episode.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
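<p>A usage sketch for the Episode API above (hedged: <code>transitions</code> is assumed to be a list of Transition objects whose last element has its game_over flag set; it is not built here):</p>
<pre>
from rl_coach.core_types import Episode

episode = Episode(discount=0.99)
for transition in transitions:      # "transitions" is an assumption, see above
    episode.insert(transition)
if not episode.is_empty():
    print(episode.length(), episode.get_first_transition())
</pre>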
@@ -613,25 +552,21 @@ between the agent and the environment. The most basic version should contain the
(current state, action, reward, next state, game over)
For imitation learning algorithms, if the reward, next state or game over is not known,
it is sufficient to store the current state and action taken by the expert.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>state</strong> The current state. Assumed to be a dictionary where the observation
is located at state['observation']</li>
<li><strong>action</strong> The current action that was taken</li>
<li><strong>reward</strong> The reward received from the environment</li>
<li><strong>next_state</strong> The next state of the environment after applying the action.
The next state should be similar to the state in its structure.</li>
<li><strong>game_over</strong> A boolean which should be True if the episode terminated after
the execution of the action.</li>
<li><strong>info</strong> A dictionary containing any additional information to be stored in the transition</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>state</strong> The current state. Assumed to be a dictionary where the observation
is located at state['observation']</p></li>
<li><p><strong>action</strong> The current action that was taken</p></li>
<li><p><strong>reward</strong> The reward received from the environment</p></li>
<li><p><strong>next_state</strong> The next state of the environment after applying the action.
The next state should be similar to the state in its structure.</p></li>
<li><p><strong>game_over</strong> A boolean which should be True if the episode terminated after
the execution of the action.</p></li>
<li><p><strong>info</strong> A dictionary containing any additional information to be stored in the transition</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
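<p>For example (illustrative values only; the keyword names follow the parameter list above):</p>
<pre>
import numpy as np
from rl_coach.core_types import Transition

transition = Transition(state={'observation': np.zeros(4)},
                        action=0,
                        reward=1.0,
                        next_state={'observation': np.ones(4)},
                        game_over=False,
                        info={})
</pre>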
</div>
@@ -648,7 +583,7 @@ the execution of the action.</li>
<a href="spaces.html" class="btn btn-neutral float-right" title="Spaces" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="orchestrators/index.html" class="btn btn-neutral" title="Orchestrators" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="orchestrators/index.html" class="btn btn-neutral float-left" title="Orchestrators" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -657,7 +592,7 @@ the execution of the action.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -674,27 +609,16 @@ the execution of the action.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script type="text/javascript" src="../_static/jquery.js"></script>
<script type="text/javascript" src="../_static/underscore.js"></script>
<script type="text/javascript" src="../_static/doctools.js"></script>
<script type="text/javascript" src="../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Data Stores &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Data Stores &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Architectures" href="../architectures/index.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -194,14 +197,11 @@
<em class="property">class </em><code class="descclassname">rl_coach.data_stores.s3_data_store.</code><code class="descname">S3DataStore</code><span class="sig-paren">(</span><em>params: rl_coach.data_stores.s3_data_store.S3DataStoreParameters</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/data_stores/s3_data_store.html#S3DataStore"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.data_stores.s3_data_store.S3DataStore" title="Permalink to this definition"></a></dt>
<dd><p>An implementation of the data store using S3 for storing policy checkpoints when using Coach in distributed mode.
The policy checkpoints are written by the trainer and read by the rollout worker.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>params</strong> The parameters required to use the S3 data store.</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>params</strong> The parameters required to use the S3 data store.</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -212,14 +212,11 @@ The policy checkpoints are written by the trainer and read by the rollout worker
<em class="property">class </em><code class="descclassname">rl_coach.data_stores.nfs_data_store.</code><code class="descname">NFSDataStore</code><span class="sig-paren">(</span><em>params: rl_coach.data_stores.nfs_data_store.NFSDataStoreParameters</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/data_stores/nfs_data_store.html#NFSDataStore"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.data_stores.nfs_data_store.NFSDataStore" title="Permalink to this definition"></a></dt>
<dd><p>An implementation of data store which uses NFS for storing policy checkpoints when using Coach in distributed mode.
The policy checkpoints are written by the trainer and read by the rollout worker.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>params</strong> The parameters required to use the NFS data store.</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>params</strong> The parameters required to use the NFS data store.</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -236,7 +233,7 @@ The policy checkpoints are written by the trainer and read by the rollout worker
<a href="../environments/index.html" class="btn btn-neutral float-right" title="Environments" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../architectures/index.html" class="btn btn-neutral" title="Architectures" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../architectures/index.html" class="btn btn-neutral float-left" title="Architectures" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -245,7 +242,7 @@ The policy checkpoints are written by the trainer and read by the rollout worker
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -262,27 +259,16 @@ The policy checkpoints are written by the trainer and read by the rollout worker
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Environments &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Environments &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Data Stores" href="../data_stores/index.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -193,106 +196,84 @@
<dl class="class">
<dt id="rl_coach.environments.environment.Environment">
<em class="property">class </em><code class="descclassname">rl_coach.environments.environment.</code><code class="descname">Environment</code><span class="sig-paren">(</span><em>level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, **kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>level</strong> The environment level. Each environment can have multiple levels</li>
<li><strong>seed</strong> a seed for the random number generator of the environment</li>
<li><strong>frame_skip</strong> number of frames to skip (while repeating the same action) between each two agent directives</li>
<li><strong>human_control</strong> human should control the environment</li>
<li><strong>visualization_parameters</strong> a blob of parameters used for visualization of the environment</li>
<li><strong>**kwargs</strong> <p>as the class is instantiated by EnvironmentParameters, this is used to support having
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>level</strong> The environment level. Each environment can have multiple levels</p></li>
<li><p><strong>seed</strong> a seed for the random number generator of the environment</p></li>
<li><p><strong>frame_skip</strong> number of frames to skip (while repeating the same action) between each two agent directives</p></li>
<li><p><strong>human_control</strong> human should control the environment</p></li>
<li><p><strong>visualization_parameters</strong> a blob of parameters used for visualization of the environment</p></li>
<li><p><strong>**kwargs</strong> <p>as the class is instantiated by EnvironmentParameters, this is used to support having
additional arguments which will be ignored by this class, but might be used by others</p>
</li>
</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
<dl class="attribute">
<dt id="rl_coach.environments.environment.Environment.action_space">
<code class="descname">action_space</code><a class="headerlink" href="#rl_coach.environments.environment.Environment.action_space" title="Permalink to this definition"></a></dt>
<dd><p>Get the action space of the environment</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the action space</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>the action space</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.close">
<code class="descname">close</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.close"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.close" title="Permalink to this definition"></a></dt>
<dd><p>Clean up steps.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.get_action_from_user">
<code class="descname">get_action_from_user</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; Union[int, float, numpy.ndarray, List]<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_action_from_user"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_action_from_user" title="Permalink to this definition"></a></dt>
<dd><p>Get an action from the user keyboard</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">action index</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>action index</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.get_available_keys">
<code class="descname">get_available_keys</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; List[Tuple[str, Union[int, float, numpy.ndarray, List]]]<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_available_keys"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_available_keys" title="Permalink to this definition"></a></dt>
<dd><p>Return a list of tuples mapping between action names and the keyboard key that triggers them</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">a list of tuples mapping between action names and the keyboard key that triggers them</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a list of tuples mapping between action names and the keyboard key that triggers them</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.get_goal">
<code class="descname">get_goal</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; Union[None, numpy.ndarray]<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_goal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_goal" title="Permalink to this definition"></a></dt>
<dd><p>Get the current goal that the agent needs to achieve in the environment</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The goal</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The goal</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.get_random_action">
<code class="descname">get_random_action</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; Union[int, float, numpy.ndarray, List]<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_random_action"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_random_action" title="Permalink to this definition"></a></dt>
<dd><p>Returns an action picked uniformly from the available actions</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">a numpy array with a random action</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a numpy array with a random action</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -300,56 +281,44 @@ additional arguments which will be ignored by this class, but might be used by o
<code class="descname">get_rendered_image</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_rendered_image"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_rendered_image" title="Permalink to this definition"></a></dt>
<dd><p>Return a numpy array containing the image that will be rendered to the screen.
This can be different from the observation. For example, MuJoCo's observation is a vector of measurements.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">numpy array containing the image that will be rendered to the screen</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>numpy array containing the image that will be rendered to the screen</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
<dt id="rl_coach.environments.environment.Environment.goal_space">
<code class="descname">goal_space</code><a class="headerlink" href="#rl_coach.environments.environment.Environment.goal_space" title="Permalink to this definition"></a></dt>
<dd><p>Get the goal space of the environment</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the observation space</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>the observation space</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.handle_episode_ended">
<code class="descname">handle_episode_ended</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.handle_episode_ended"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.handle_episode_ended" title="Permalink to this definition"></a></dt>
<dd><p>End an episode</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
<dt id="rl_coach.environments.environment.Environment.last_env_response">
<code class="descname">last_env_response</code><a class="headerlink" href="#rl_coach.environments.environment.Environment.last_env_response" title="Permalink to this definition"></a></dt>
<dd><p>Get the last environment response</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">a dictionary that contains the state, reward, etc.</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a dictionary that contains the state, reward, etc.</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
@@ -363,76 +332,64 @@ This can be different from the observation. For example, mujocos observation
<dt id="rl_coach.environments.environment.Environment.render">
<code class="descname">render</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.render"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.render" title="Permalink to this definition"></a></dt>
<dd><p>Call the environment function for rendering to the screen</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.reset_internal_state">
<code class="descname">reset_internal_state</code><span class="sig-paren">(</span><em>force_environment_reset=False</em><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.EnvResponse<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.reset_internal_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.reset_internal_state" title="Permalink to this definition"></a></dt>
<dd><p>Reset the environment and all the variables of the wrapper</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>force_environment_reset</strong> forces environment reset even when the game did not end</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A dictionary containing the observation, reward, done flag, action and measurements</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>force_environment_reset</strong> forces environment reset even when the game did not end</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A dictionary containing the observation, reward, done flag, action and measurements</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.set_goal">
<code class="descname">set_goal</code><span class="sig-paren">(</span><em>goal: Union[None, numpy.ndarray]</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.set_goal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.set_goal" title="Permalink to this definition"></a></dt>
<dd><p>Set the current goal that the agent needs to achieve in the environment</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>goal</strong> the goal that needs to be achieved</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>goal</strong> the goal that needs to be achieved</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
<dt id="rl_coach.environments.environment.Environment.state_space">
<code class="descname">state_space</code><a class="headerlink" href="#rl_coach.environments.environment.Environment.state_space" title="Permalink to this definition"></a></dt>
<dd><p>Get the state space of the environment</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the observation space</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>the observation space</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.environments.environment.Environment.step">
<code class="descname">step</code><span class="sig-paren">(</span><em>action: Union[int, float, numpy.ndarray, List]</em><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.EnvResponse<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.step"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.step" title="Permalink to this definition"></a></dt>
<dd><p>Make a single step in the environment using the given action</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> an action to use for stepping the environment. Should follow the definition of the action space.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the environment response as returned in get_last_env_response</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action</strong> an action to use for stepping the environment. Should follow the definition of the action space.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the environment response as returned in get_last_env_response</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
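<p>As an illustration, the API above can be exercised with a short random-agent loop. The sketch below assumes gym and its CartPole level are installed, and uses the GymEnvironment wrapper documented further down this page with default visualization parameters:</p>
<div class="highlight"><pre>
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.gym_environment import GymEnvironment

# any concrete Environment subclass exposes the API documented above
env = GymEnvironment(level='CartPole-v0', frame_skip=1,
                     visualization_parameters=VisualizationParameters(), seed=0)

env.reset_internal_state(force_environment_reset=True)
for _ in range(200):
    # step() returns an EnvResponse; its game_over flag marks the end of an episode
    response = env.step(env.get_random_action())
    if response.game_over:
        env.reset_internal_state()
</pre></div>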
@@ -444,38 +401,34 @@ This can be different from the observation. For example, mujocos observation
<dl class="class">
<dt id="rl_coach.environments.control_suite_environment.ControlSuiteEnvironment">
<em class="property">class </em><code class="descclassname">rl_coach.environments.control_suite_environment.</code><code class="descname">ControlSuiteEnvironment</code><span class="sig-paren">(</span><em>level: rl_coach.environments.environment.LevelSelection</em>, <em>frame_skip: int</em>, <em>visualization_parameters: rl_coach.base_parameters.VisualizationParameters</em>, <em>target_success_rate: float = 1.0</em>, <em>seed: Union[None</em>, <em>int] = None</em>, <em>human_control: bool = False</em>, <em>observation_type: rl_coach.environments.control_suite_environment.ObservationType = &lt;ObservationType.Measurements: 1&gt;</em>, <em>custom_reward_threshold: Union[int</em>, <em>float] = None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/environments/control_suite_environment.html#ControlSuiteEnvironment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.control_suite_environment.ControlSuiteEnvironment" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>level</strong> (str)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>level</strong> (str)
A string representing the control suite level to run. This can also be a LevelSelection object.
For example, cartpole:swingup.</li>
<li><strong>frame_skip</strong> (int)
For example, cartpole:swingup.</p></li>
<li><p><strong>frame_skip</strong> (int)
The number of frames to skip between any two actions given by the agent. The action will be repeated
for all the skipped frames.</li>
<li><strong>visualization_parameters</strong> (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</li>
<li><strong>target_success_rate</strong> (float)
Stop experiment if given target success rate was achieved.</li>
<li><strong>seed</strong> (int)
A seed to use for the random number generator when running the environment.</li>
<li><strong>human_control</strong> (bool)
A flag that allows controlling the environment using the keyboard keys.</li>
<li><strong>observation_type</strong> (ObservationType)
for all the skipped frames.</p></li>
<li><p><strong>visualization_parameters</strong> (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</p></li>
<li><p><strong>target_success_rate</strong> (float)
Stop experiment if given target success rate was achieved.</p></li>
<li><p><strong>seed</strong> (int)
A seed to use for the random number generator when running the environment.</p></li>
<li><p><strong>human_control</strong> (bool)
A flag that allows controlling the environment using the keyboard keys.</p></li>
<li><p><strong>observation_type</strong> (ObservationType)
An enum which defines which observation to use. The current options are to use:
* Measurements only - a vector of joint torques and similar measurements
* Image only - an image of the environment as seen by a camera attached to the simulator
* Measurements &amp; Image - both type of observations will be returned in the state using the keys
measurements and pixels respectively.</li>
<li><strong>custom_reward_threshold</strong> (float)
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.</li>
measurements and pixels respectively.</p></li>
<li><p><strong>custom_reward_threshold</strong> (float)
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
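<p>For illustration, constructing this environment directly might look as follows. This is a sketch that assumes the DeepMind Control Suite and its MuJoCo dependency are installed; the argument values are examples, not recommendations:</p>
<div class="highlight"><pre>
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.control_suite_environment import (
    ControlSuiteEnvironment, ObservationType)

# 'cartpole:swingup' follows the domain:task naming used by the control suite
env = ControlSuiteEnvironment(level='cartpole:swingup',
                              frame_skip=1,
                              visualization_parameters=VisualizationParameters(),
                              seed=0,
                              observation_type=ObservationType.Measurements)
</pre></div>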
</div>
@@ -496,47 +449,39 @@ Allows defining a custom reward that will be used to decide when the agent succe
<dl class="class">
<dt id="rl_coach.environments.doom_environment.DoomEnvironment">
<em class="property">class </em><code class="descclassname">rl_coach.environments.doom_environment.</code><code class="descname">DoomEnvironment</code><span class="sig-paren">(</span><em>level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, cameras: List[rl_coach.environments.doom_environment.DoomEnvironment.CameraTypes], target_success_rate: float = 1.0, **kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/environments/doom_environment.html#DoomEnvironment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.doom_environment.DoomEnvironment" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>level</strong> (str)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>level</strong> (str)
A string representing the doom level to run. This can also be a LevelSelection object.
This should be one of the levels defined in the DoomLevel enum. For example, HEALTH_GATHERING.</li>
<li><strong>seed</strong> (int)
A seed to use for the random number generator when running the environment.</li>
<li><strong>frame_skip</strong> (int)
This should be one of the levels defined in the DoomLevel enum. For example, HEALTH_GATHERING.</p></li>
<li><p><strong>seed</strong> (int)
A seed to use for the random number generator when running the environment.</p></li>
<li><p><strong>frame_skip</strong> (int)
The number of frames to skip between any two actions given by the agent. The action will be repeated
for all the skipped frames.</li>
<li><strong>human_control</strong> (bool)
A flag that allows controlling the environment using the keyboard keys.</li>
<li><strong>custom_reward_threshold</strong> (float)
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.</li>
<li><strong>visualization_parameters</strong> (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</li>
<li><strong>cameras</strong> <p>(List[CameraTypes])
for all the skipped frames.</p></li>
<li><p><strong>human_control</strong> (bool)
A flag that allows controlling the environment using the keyboard keys.</p></li>
<li><p><strong>custom_reward_threshold</strong> (float)
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.</p></li>
<li><p><strong>visualization_parameters</strong> (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</p></li>
<li><p><strong>cameras</strong> <p>(List[CameraTypes])
A list of camera types to use as observation in the state returned from the environment.
Each camera should be an enum from CameraTypes, and there are several options like an RGB observation,
a depth map, a segmentation map, and a top down map of the environment.</p>
<blockquote>
<div><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name" colspan="2">param target_success_rate:</th></tr>
<tr class="field-odd field"><td>&#160;</td><td class="field-body">(float)
Stop experiment if given target success rate was achieved.</td>
</tr>
</tbody>
</table>
<div><dl class="field-list simple">
<dt class="field-odd">param target_success_rate</dt>
<dd class="field-odd"><p>(float)
Stop experiment if given target success rate was achieved.</p>
</dd>
</dl>
</div></blockquote>
</li>
</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
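<p>A hedged construction sketch follows. It assumes ViZDoom is installed; the lowercase level string and the CameraTypes member name are assumptions about the DoomLevel and CameraTypes enums rather than values documented above:</p>
<div class="highlight"><pre>
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.doom_environment import DoomEnvironment

env = DoomEnvironment(level='health_gathering',   # assumed to match a DoomLevel entry
                      seed=0,
                      frame_skip=1,
                      human_control=False,
                      custom_reward_threshold=None,
                      visualization_parameters=VisualizationParameters(),
                      # CameraTypes.OBSERVATION is assumed to be the plain RGB camera
                      cameras=[DoomEnvironment.CameraTypes.OBSERVATION])
</pre></div>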
</div>
@@ -557,54 +502,50 @@ Additionally, it can be extended using the API defined by the authors.</p>
<p>Website: <a class="reference external" href="https://gym.openai.com/">OpenAI Gym</a></p>
<p>In Coach, we support all the native environments in Gym, along with several extensions such as:</p>
<ul class="simple">
<li><a class="reference external" href="https://github.com/openai/roboschool">Roboschool</a> - a set of environments powered by the PyBullet engine,
that offer a free alternative to MuJoCo.</li>
<li><a class="reference external" href="https://github.com/Breakend/gym-extensions">Gym Extensions</a> - a set of environments that extends Gym for
auxiliary tasks (multitask learning, transfer learning, inverse reinforcement learning, etc.)</li>
<li><a class="reference external" href="https://github.com/bulletphysics/bullet3/tree/master/examples/pybullet">PyBullet</a> - a physics engine that
includes a set of robotics environments.</li>
<li><p><a class="reference external" href="https://github.com/openai/roboschool">Roboschool</a> - a set of environments powered by the PyBullet engine,
that offer a free alternative to MuJoCo.</p></li>
<li><p><a class="reference external" href="https://github.com/Breakend/gym-extensions">Gym Extensions</a> - a set of environments that extends Gym for
auxiliary tasks (multitask learning, transfer learning, inverse reinforcement learning, etc.)</p></li>
<li><p><a class="reference external" href="https://github.com/bulletphysics/bullet3/tree/master/examples/pybullet">PyBullet</a> - a physics engine that
includes a set of robotics environments.</p></li>
</ul>
<dl class="class">
<dt id="rl_coach.environments.gym_environment.GymEnvironment">
<em class="property">class </em><code class="descclassname">rl_coach.environments.gym_environment.</code><code class="descname">GymEnvironment</code><span class="sig-paren">(</span><em>level: rl_coach.environments.environment.LevelSelection</em>, <em>frame_skip: int</em>, <em>visualization_parameters: rl_coach.base_parameters.VisualizationParameters</em>, <em>target_success_rate: float = 1.0</em>, <em>additional_simulator_parameters: Dict[str</em>, <em>Any] = {}</em>, <em>seed: Union[None</em>, <em>int] = None</em>, <em>human_control: bool = False</em>, <em>custom_reward_threshold: Union[int</em>, <em>float] = None</em>, <em>random_initialization_steps: int = 1</em>, <em>max_over_num_frames: int = 1</em>, <em>observation_space_type: rl_coach.environments.gym_environment.ObservationSpaceType = None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/environments/gym_environment.html#GymEnvironment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.gym_environment.GymEnvironment" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>level</strong> (str)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>level</strong> (str)
A string representing the gym level to run. This can also be a LevelSelection object.
For example, BreakoutDeterministic-v0</li>
<li><strong>frame_skip</strong> (int)
For example, BreakoutDeterministic-v0</p></li>
<li><p><strong>frame_skip</strong> (int)
The number of frames to skip between any two actions given by the agent. The action will be repeated
for all the skipped frames.</li>
<li><strong>visualization_parameters</strong> (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</li>
<li><strong>additional_simulator_parameters</strong> (Dict[str, Any])
for all the skipped frames.</p></li>
<li><p><strong>visualization_parameters</strong> (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</p></li>
<li><p><strong>additional_simulator_parameters</strong> (Dict[str, Any])
Any additional parameters that the user can pass to the Gym environment. These parameters should be
accepted by the __init__ function of the implemented Gym environment.</li>
<li><strong>seed</strong> (int)
A seed to use for the random number generator when running the environment.</li>
<li><strong>human_control</strong> (bool)
A flag that allows controlling the environment using the keyboard keys.</li>
<li><strong>custom_reward_threshold</strong> (float)
accepted by the __init__ function of the implemented Gym environment.</p></li>
<li><p><strong>seed</strong> (int)
A seed to use for the random number generator when running the environment.</p></li>
<li><p><strong>human_control</strong> (bool)
A flag that allows controlling the environment using the keyboard keys.</p></li>
<li><p><strong>custom_reward_threshold</strong> (float)
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.
If not set, this value will be taken from the Gym environment definition.</li>
<li><strong>random_initialization_steps</strong> (int)
If not set, this value will be taken from the Gym environment definition.</p></li>
<li><p><strong>random_initialization_steps</strong> (int)
The number of random steps that will be taken in the environment after each reset.
This is a feature presented in the DQN paper, which improves the variability of the episodes the agent sees.</li>
<li><strong>max_over_num_frames</strong> (int)
This is a feature presented in the DQN paper, which improves the variability of the episodes the agent sees.</p></li>
<li><p><strong>max_over_num_frames</strong> (int)
This value will be used for merging multiple frames into a single frame by taking the maximum value for each
of the pixels in the frame. This is particularly used in Atari games, where the frames flicker, and objects
can be seen in one frame but disappear in the next.</li>
<li><strong>observation_space_type</strong> This value will be used for generating observation space. Allows a custom space. Should be one of
can be seen in one frame but disappear in the next.</p></li>
<li><p><strong>observation_space_type</strong> This value will be used for generating observation space. Allows a custom space. Should be one of
ObservationSpaceType. If not specified, observation space is inferred from the number of dimensions
of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, PlanarMaps space otherwise.</li>
of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, PlanarMaps space otherwise.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
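<p>To make the frame skipping, max-over-frames and random initialization parameters concrete, an Atari-style configuration could look like the sketch below (it assumes gym with the Atari extras is installed; the numbers are common DQN-style choices, not values mandated by Coach):</p>
<div class="highlight"><pre>
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.gym_environment import GymEnvironment

env = GymEnvironment(level='BreakoutDeterministic-v0',
                     frame_skip=4,                    # repeat each action for 4 frames
                     max_over_num_frames=2,           # max over consecutive frames to remove flickering
                     random_initialization_steps=30,  # random steps taken after each reset
                     visualization_parameters=VisualizationParameters(),
                     seed=0)
</pre></div>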
</div>
@@ -621,7 +562,7 @@ of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, Planar
<a href="../exploration_policies/index.html" class="btn btn-neutral float-right" title="Exploration Policies" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../data_stores/index.html" class="btn btn-neutral" title="Data Stores" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../data_stores/index.html" class="btn btn-neutral float-left" title="Data Stores" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -630,7 +571,7 @@ of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, Planar
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -647,27 +588,16 @@ of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, Planar
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Exploration Policies &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Exploration Policies &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Environments" href="../environments/index.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -202,62 +205,62 @@ predefined policy. This is one of the most important aspects of reinforcement le
tuning to get it right. Coach supports several pre-defined exploration policies, and it can be easily extended with
custom policies. Note that not all exploration policies are expected to work for both discrete and continuous action
spaces.</p>
<table border="1" class="docutils">
<table class="docutils align-center">
<colgroup>
<col width="35%" />
<col width="37%" />
<col width="29%" />
<col style="width: 35%" />
<col style="width: 37%" />
<col style="width: 29%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Exploration Policy</th>
<th class="head">Discrete Action Space</th>
<th class="head">Box Action Space</th>
<thead>
<tr class="row-odd"><th class="head"><p>Exploration Policy</p></th>
<th class="head"><p>Discrete Action Space</p></th>
<th class="head"><p>Box Action Space</p></th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>AdditiveNoise</td>
<td><span class="red">X</span></td>
<td><span class="green">V</span></td>
<tbody>
<tr class="row-even"><td><p>AdditiveNoise</p></td>
<td><p><span class="red">X</span></p></td>
<td><p><span class="green">V</span></p></td>
</tr>
<tr class="row-odd"><td>Boltzmann</td>
<td><span class="green">V</span></td>
<td><span class="red">X</span></td>
<tr class="row-odd"><td><p>Boltzmann</p></td>
<td><p><span class="green">V</span></p></td>
<td><p><span class="red">X</span></p></td>
</tr>
<tr class="row-even"><td>Bootstrapped</td>
<td><span class="green">V</span></td>
<td><span class="red">X</span></td>
<tr class="row-even"><td><p>Bootstrapped</p></td>
<td><p><span class="green">V</span></p></td>
<td><p><span class="red">X</span></p></td>
</tr>
<tr class="row-odd"><td>Categorical</td>
<td><span class="green">V</span></td>
<td><span class="red">X</span></td>
<tr class="row-odd"><td><p>Categorical</p></td>
<td><p><span class="green">V</span></p></td>
<td><p><span class="red">X</span></p></td>
</tr>
<tr class="row-even"><td>ContinuousEntropy</td>
<td><span class="red">X</span></td>
<td><span class="green">V</span></td>
<tr class="row-even"><td><p>ContinuousEntropy</p></td>
<td><p><span class="red">X</span></p></td>
<td><p><span class="green">V</span></p></td>
</tr>
<tr class="row-odd"><td>EGreedy</td>
<td><span class="green">V</span></td>
<td><span class="green">V</span></td>
<tr class="row-odd"><td><p>EGreedy</p></td>
<td><p><span class="green">V</span></p></td>
<td><p><span class="green">V</span></p></td>
</tr>
<tr class="row-even"><td>Greedy</td>
<td><span class="green">V</span></td>
<td><span class="green">V</span></td>
<tr class="row-even"><td><p>Greedy</p></td>
<td><p><span class="green">V</span></p></td>
<td><p><span class="green">V</span></p></td>
</tr>
<tr class="row-odd"><td>OUProcess</td>
<td><span class="red">X</span></td>
<td><span class="green">V</span></td>
<tr class="row-odd"><td><p>OUProcess</p></td>
<td><p><span class="red">X</span></p></td>
<td><p><span class="green">V</span></p></td>
</tr>
<tr class="row-even"><td>ParameterNoise</td>
<td><span class="green">V</span></td>
<td><span class="green">V</span></td>
<tr class="row-even"><td><p>ParameterNoise</p></td>
<td><p><span class="green">V</span></p></td>
<td><p><span class="green">V</span></p></td>
</tr>
<tr class="row-odd"><td>TruncatedNormal</td>
<td><span class="red">X</span></td>
<td><span class="green">V</span></td>
<tr class="row-odd"><td><p>TruncatedNormal</p></td>
<td><p><span class="red">X</span></p></td>
<td><p><span class="green">V</span></p></td>
</tr>
<tr class="row-even"><td>UCB</td>
<td><span class="green">V</span></td>
<td><span class="red">X</span></td>
<tr class="row-even"><td><p>UCB</p></td>
<td><p><span class="green">V</span></p></td>
<td><p><span class="red">X</span></p></td>
</tr>
</tbody>
</table>
@@ -268,14 +271,11 @@ spaces.</p>
<em class="property">class </em><code class="descclassname">rl_coach.exploration_policies.exploration_policy.</code><code class="descname">ExplorationPolicy</code><span class="sig-paren">(</span><em>action_space: rl_coach.spaces.ActionSpace</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/exploration_policies/exploration_policy.html#ExplorationPolicy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.exploration_policies.exploration_policy.ExplorationPolicy" title="Permalink to this definition"></a></dt>
<dd><p>An exploration policy takes the predicted actions or action values from the agent, and selects the action to
actually apply to the environment using some predefined algorithm.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> the action space used by the environment</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action_space</strong> the action space used by the environment</p>
</dd>
</dl>
<dl class="method">
<dt id="rl_coach.exploration_policies.exploration_policy.ExplorationPolicy.change_phase">
<code class="descname">change_phase</code><span class="sig-paren">(</span><em>phase</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/exploration_policies/exploration_policy.html#ExplorationPolicy.change_phase"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.exploration_policies.exploration_policy.ExplorationPolicy.change_phase" title="Permalink to this definition"></a></dt>
@@ -323,20 +323,16 @@ can be given in two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>action_space</strong> the action space used by the environment</li>
<li><strong>noise_percentage_schedule</strong> the schedule for the noise variance percentage relative to the absolute range
of the action space</li>
<li><strong>evaluation_noise_percentage</strong> the noise variance percentage that will be used during evaluation phases</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>action_space</strong> the action space used by the environment</p></li>
<li><p><strong>noise_percentage_schedule</strong> the schedule for the noise variance percentage relative to the absolute range
of the action space</p></li>
<li><p><strong>evaluation_noise_percentage</strong> the noise variance percentage that will be used during evaluation phases</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
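<p>A minimal usage sketch for a continuous policy such as this one is shown below. The BoxActionSpace and LinearSchedule constructors are assumptions based on the rl_coach.spaces and rl_coach.schedules modules rather than on the text above, and the schedule values are arbitrary examples:</p>
<div class="highlight"><pre>
import numpy as np

from rl_coach.exploration_policies.additive_noise import AdditiveNoise
from rl_coach.schedules import LinearSchedule
from rl_coach.spaces import BoxActionSpace

action_space = BoxActionSpace(1, low=-1.0, high=1.0)
policy = AdditiveNoise(action_space=action_space,
                       noise_percentage_schedule=LinearSchedule(0.4, 0.05, 100000),
                       evaluation_noise_percentage=0.05)

# the action predicted by the agent (here a single mean value) gets Gaussian noise added to it
noisy_action = policy.get_action(np.array([0.2]))
</pre></div>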
</div>
@@ -349,18 +345,14 @@ of the action space</li>
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
into a distribution over the actions. It then samples the action for playing out of the calculated distribution.
An additional temperature schedule can be given by the user, and will control the steepness of the softmax function.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>action_space</strong> the action space used by the environment</li>
<li><strong>temperature_schedule</strong> the schedule for the temperature parameter of the softmax</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>action_space</strong> the action space used by the environment</p></li>
<li><p><strong>temperature_schedule</strong> the schedule for the temperature parameter of the softmax</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -375,26 +367,22 @@ values for all the possible actions. For each episode, a single head is selected
to its value predictions. In evaluation, the action is selected using a majority vote over all the heads'
predictions.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
<p class="admonition-title">Note</p>
<p>This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
since it requires the agent to have a network with multiple heads.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>action_space</strong> the action space used by the environment</li>
<li><strong>epsilon_schedule</strong> a schedule for the epsilon values</li>
<li><strong>evaluation_epsilon</strong> the epsilon value to use for evaluation phases</li>
<li><strong>continuous_exploration_policy_parameters</strong> the parameters of the continuous exploration policy to use
if the e-greedy is used for a continuous policy</li>
<li><strong>architecture_num_q_heads</strong> the number of q heads to select from</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>action_space</strong> the action space used by the environment</p></li>
<li><p><strong>epsilon_schedule</strong> a schedule for the epsilon values</p></li>
<li><p><strong>evaluation_epsilon</strong> the epsilon value to use for evaluation phases</p></li>
<li><p><strong>continuous_exploration_policy_parameters</strong> the parameters of the continuous exploration policy to use
if the e-greedy is used for a continuous policy</p></li>
<li><p><strong>architecture_num_q_heads</strong> the number of q heads to select from</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -407,14 +395,11 @@ if the e-greedy is used for a continuous policy</li>
represent a probability distribution over the actions, from which a single action will be sampled.
In evaluation, the action that has the highest probability will be selected. This is particularly useful for
actor-critic schemes, where the actor's output is a probability distribution over the actions.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> the action space used by the environment</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action_space</strong> the action space used by the environment</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -429,24 +414,20 @@ implemented by adding a regularization factor to the network loss, which regular
This exploration policy is only intended for continuous action spaces, and assumes that the entire calculation
is implemented as part of the head.</p>
<div class="admonition warning">
<p class="first admonition-title">Warning</p>
<p class="last">This exploration policy expects the agent or the network to implement the exploration functionality.
<p class="admonition-title">Warning</p>
<p>This exploration policy expects the agent or the network to implement the exploration functionality.
There are only a few heads that actually are relevant and implement the entropy regularization factor.</p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>action_space</strong> the action space used by the environment</li>
<li><strong>noise_percentage_schedule</strong> the schedule for the noise variance percentage relative to the absolute range
of the action space</li>
<li><strong>evaluation_noise_percentage</strong> the noise variance percentage that will be used during evaluation phases</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>action_space</strong> the action space used by the environment</p></li>
<li><p><strong>noise_percentage_schedule</strong> the schedule for the noise variance percentage relative to the absolute range
of the action space</p></li>
<li><p><strong>evaluation_noise_percentage</strong> the noise variance percentage that will be used during evaluation phases</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -464,21 +445,17 @@ In evaluation, a different epsilon value can be specified.</p>
it samples a random action from within the action space bounds. Otherwise, it selects the action according to a
given continuous exploration policy, which is set to AdditiveNoise by default. In evaluation, the action is
always selected according to the given continuous exploration policy (where its phase is set to evaluation as well).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>action_space</strong> the action space used by the environment</li>
<li><strong>epsilon_schedule</strong> a schedule for the epsilon values</li>
<li><strong>evaluation_epsilon</strong> the epsilon value to use for evaluation phases</li>
<li><strong>continuous_exploration_policy_parameters</strong> the parameters of the continuous exploration policy to use
if the e-greedy is used for a continuous policy</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>action_space</strong> the action space used by the environment</p></li>
<li><p><strong>epsilon_schedule</strong> a schedule for the epsilon values</p></li>
<li><p><strong>evaluation_epsilon</strong> the epsilon value to use for evaluation phases</p></li>
<li><p><strong>continuous_exploration_policy_parameters</strong> the parameters of the continuous exploration policy to use
if the e-greedy is used for a continuous policy</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
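<p>A corresponding sketch for a discrete action space is given below. DiscreteActionSpace and LinearSchedule are assumed to come from rl_coach.spaces and rl_coach.schedules, and the continuous_exploration_policy_parameters argument is left at its default since the action space here is discrete:</p>
<div class="highlight"><pre>
import numpy as np

from rl_coach.exploration_policies.e_greedy import EGreedy
from rl_coach.schedules import LinearSchedule
from rl_coach.spaces import DiscreteActionSpace

policy = EGreedy(action_space=DiscreteActionSpace(4),
                 epsilon_schedule=LinearSchedule(1.0, 0.01, 10000),
                 evaluation_epsilon=0.001)

# with probability epsilon a random action is drawn, otherwise the action with the highest value
q_values = np.array([0.1, 0.5, 0.2, 0.2])
action = policy.get_action(q_values)
</pre></div>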
</div>
@@ -490,14 +467,11 @@ if the e-greedy is used for a continuous policy</li>
<dd><p>The Greedy exploration policy is intended for both discrete and continuous action spaces.
For discrete action spaces, it always selects the action with the maximum value, as given by the agent.
For continuous action spaces, it always returns the exact action, as it was given by the agent.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> the action space used by the environment</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action_space</strong> the action space used by the environment</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -509,14 +483,11 @@ For continuous action spaces, it always return the exact action, as it was given
<dd><p>OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
the samples are correlated between consecutive time steps.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> the action space used by the environment</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action_space</strong> the action space used by the environment</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -531,14 +502,11 @@ The noisy layers have both weight means and weight standard deviations, and for
the weights are sampled from a normal distribution that follows the learned weight means and standard deviation
values.</p>
<p>Warning: currently supported only by DQN variants</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> the action space used by the environment</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action_space</strong> the action space used by the environment</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -555,20 +523,16 @@ wo different ways:
be the mean of the action, and the 2nd is assumed to be its standard deviation.
When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
is within the bounds.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>action_space</strong> the action space used by the environment</li>
<li><strong>noise_percentage_schedule</strong> the schedule for the noise variance percentage relative to the absolute range
of the action space</li>
<li><strong>evaluation_noise_percentage</strong> the noise variance percentage that will be used during evaluation phases</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>action_space</strong> the action space used by the environment</p></li>
<li><p><strong>noise_percentage_schedule</strong> the schedule for the noise variance percentage relative to the absolute range
of the action space</p></li>
<li><p><strong>evaluation_noise_percentage</strong> the noise variance percentage that will be used during evaluation phases</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -584,23 +548,19 @@ It then updates the action value estimates to by mean(actions)+lambda*stdev(acti
given by the user. This exploration policy aims to take advantage of the uncertainty of the agent in its predictions,
and select the action according to the tradeoff between how uncertain the agent is, and how large it predicts
the outcome from those actions to be.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>action_space</strong> the action space used by the environment</li>
<li><strong>epsilon_schedule</strong> a schedule for the epsilon values</li>
<li><strong>evaluation_epsilon</strong> the epsilon value to use for evaluation phases</li>
<li><strong>architecture_num_q_heads</strong> the number of q heads to select from</li>
<li><strong>lamb</strong> lambda coefficient for taking the standard deviation into account</li>
<li><strong>continuous_exploration_policy_parameters</strong> the parameters of the continuous exploration policy to use
if the e-greedy is used for a continuous policy</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>action_space</strong> the action space used by the environment</p></li>
<li><p><strong>epsilon_schedule</strong> a schedule for the epsilon values</p></li>
<li><p><strong>evaluation_epsilon</strong> the epsilon value to use for evaluation phases</p></li>
<li><p><strong>architecture_num_q_heads</strong> the number of q heads to select from</p></li>
<li><p><strong>lamb</strong> lambda coefficient for taking the standard deviation into account</p></li>
<li><p><strong>continuous_exploration_policy_parameters</strong> the parameters of the continuous exploration policy to use
if the e-greedy is used for a continuous policy</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -617,7 +577,7 @@ if the e-greedy is used for a continuous policy</li>
<a href="../filters/index.html" class="btn btn-neutral float-right" title="Filters" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../environments/index.html" class="btn btn-neutral" title="Environments" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../environments/index.html" class="btn btn-neutral float-left" title="Environments" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -626,7 +586,7 @@ if the e-greedy is used for a continuous policy</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -643,27 +603,16 @@ if the e-greedy is used for a continuous policy</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Filters &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Filters &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Exploration Policies" href="../exploration_policies/index.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -197,13 +200,13 @@
<p>Filters are a mechanism in Coach that allows doing pre-processing and post-processing of the internal agent information.
There are two filter categories -</p>
<ul class="simple">
<li><strong>Input filters</strong> - these are filters that process the information passed <strong>into</strong> the agent from the environment.
<li><p><strong>Input filters</strong> - these are filters that process the information passed <strong>into</strong> the agent from the environment.
This information includes the observation and the reward. Input filters therefore allow rescaling observations,
normalizing rewards, stacking observations, etc.</li>
<li><strong>Output filters</strong> - these are filters that process the information going <strong>out</strong> of the agent into the environment.
normalizing rewards, stacking observations, etc.</p></li>
<li><p><strong>Output filters</strong> - these are filters that process the information going <strong>out</strong> of the agent into the environment.
This information includes the action the agent chooses to take. Output filters therefore allow conversion of
actions from one space into another. For example, the agent can take <span class="math notranslate nohighlight">\(N\)</span> discrete actions, that will be mapped by
the output filter onto <span class="math notranslate nohighlight">\(N\)</span> continuous actions.</li>
the output filter onto <span class="math notranslate nohighlight">\(N\)</span> continuous actions.</p></li>
</ul>
<p>Filters can be stacked on top of each other in order to build complex processing flows of the inputs or outputs.</p>
<a class="reference internal image-reference" href="../../_images/filters.png"><img alt="../../_images/filters.png" class="align-center" src="../../_images/filters.png" style="width: 350px;" /></a>
@@ -220,7 +223,7 @@ the output filter onto <span class="math notranslate nohighlight">\(N\)</span> c
<a href="input_filters.html" class="btn btn-neutral float-right" title="Input Filters" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../exploration_policies/index.html" class="btn btn-neutral" title="Exploration Policies" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../exploration_policies/index.html" class="btn btn-neutral float-left" title="Exploration Policies" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -229,7 +232,7 @@ the output filter onto <span class="math notranslate nohighlight">\(N\)</span> c
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -246,27 +249,16 @@ the output filter onto <span class="math notranslate nohighlight">\(N\)</span> c
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Input Filters &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Input Filters &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Filters" href="index.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -223,18 +226,14 @@
For example, if the observation consists of measurements in an arbitrary range,
and we want to control the minimum and maximum values of these observations,
we can define a range and clip the values of the measurements.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>clipping_low</strong> The minimum value to allow after normalizing the observation</li>
<li><strong>clipping_high</strong> The maximum value to allow after normalizing the observation</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>clipping_low</strong> The minimum value to allow after normalizing the observation</p></li>
<li><p><strong>clipping_high</strong> The maximum value to allow after normalizing the observation</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -246,20 +245,16 @@ we can define a range and clip the values of the measurements.</p>
<dd><p>Crops the size of the observation to a given crop window. For example, in Atari, the
observations are images with a shape of 210x160. Usually, we will want to crop the size of the observation to a
square of 160x160 before rescaling them.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>crop_low</strong> a vector where each dimension describes the start index for cropping the observation in the
corresponding dimension. a negative value of -1 will be mapped to the max size</li>
<li><strong>crop_high</strong> a vector where each dimension describes the end index for cropping the observation in the
corresponding dimension. a negative value of -1 will be mapped to the max size</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>crop_low</strong> a vector where each dimension describes the start index for cropping the observation in the
corresponding dimension. A negative value of -1 will be mapped to the max size</p></li>
<li><p><strong>crop_high</strong> a vector where each dimension describes the end index for cropping the observation in the
corresponding dimension. A negative value of -1 will be mapped to the max size</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
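<p>A minimal sketch of the Atari-style crop described above, assuming the filter is added through an
<code>InputFilter</code> as in the presets; the row offset and the 3-dimensional crop vectors (height, width,
channels) are illustrative and must match the shape of the actual observation:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationCropFilter

# Crop a 210x160x3 Atari frame to a 160x160x3 square: keep 160 rows starting at row 34,
# with all columns and all channels (-1 maps to the maximum size of that dimension).
input_filter = InputFilter()
input_filter.add_observation_filter(
    'observation', 'crop',
    ObservationCropFilter(crop_low=np.array([34, 0, 0]), crop_high=np.array([194, -1, -1])))
</pre></div></div>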
@@ -270,18 +265,14 @@ corresponding dimension. a negative value of -1 will be mapped to the max size</
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationMoveAxisFilter</code><span class="sig-paren">(</span><em>axis_origin: int = None</em>, <em>axis_target: int = None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_move_axis_filter.html#ObservationMoveAxisFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationMoveAxisFilter" title="Permalink to this definition"></a></dt>
<dd><p>Reorders the axes of the observation. This can be useful when the observation is an
image, and we want to move the channel axis to be the last axis instead of the first axis.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>axis_origin</strong> The axis to move</li>
<li><strong>axis_target</strong> Where to move the selected axis to</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>axis_origin</strong> The axis to move</p></li>
<li><p><strong>axis_target</strong> Where to move the selected axis to</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -293,18 +284,14 @@ image, and we want to move the channel axis to be the last axis instead of the f
<dd><p>Normalizes the observation values with a running mean and standard deviation of
all the observations seen so far. The normalization is performed element-wise. Additionally, when working with
multiple workers, the statistics used for the normalization operation are accumulated over all the workers.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>clip_min</strong> The minimum value to allow after normalizing the observation</li>
<li><strong>clip_max</strong> The maximum value to allow after normalizing the observation</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>clip_min</strong> The minimum value to allow after normalizing the observation</p></li>
<li><p><strong>clip_max</strong> The maximum value to allow after normalizing the observation</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -319,18 +306,14 @@ measurements, but you want the agent to only see some of the measurements and no
For example, the CARLA environment extracts multiple measurements that can be used by the agent, such as
speed and location. If we want to only use the speed, it can be done using this filter.
This will currently work only for VectorObservationSpace observations</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>part_names</strong> A list of part names to reduce</li>
<li><strong>reduction_method</strong> A reduction method to use - keep or discard the given parts</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>part_names</strong> A list of part names to reduce</p></li>
<li><p><strong>reduction_method</strong> A reduction method to use - keep or discard the given parts</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
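<p>A short sketch based on the CARLA example above. It assumes the reduction method is selected through a nested
<code>ReductionMethod</code> enum with a <code>Keep</code> option, as in the bundled CARLA presets, and the
observation name <code>'measurements'</code> is illustrative; check both against your Coach version.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationReductionBySubPartsNameFilter

# Keep only the 'forward_speed' measurement and discard the rest of the vector observation.
input_filter = InputFilter()
input_filter.add_observation_filter(
    'measurements', 'reduce',
    ObservationReductionBySubPartsNameFilter(
        part_names=['forward_speed'],
        reduction_method=ObservationReductionBySubPartsNameFilter.ReductionMethod.Keep))
</pre></div></div>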
@@ -338,22 +321,14 @@ This will currently work only for VectorObservationSpace observations</p>
<h3>ObservationRescaleSizeByFactorFilter<a class="headerlink" href="#observationrescalesizebyfactorfilter" title="Permalink to this headline"></a></h3>
<dl class="class">
<dt id="rl_coach.filters.observation.ObservationRescaleSizeByFactorFilter">
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationRescaleSizeByFactorFilter</code><span class="sig-paren">(</span><em>rescale_factor: float</em>, <em>rescaling_interpolation_type: rl_coach.filters.observation.observation_rescale_size_by_factor_filter.RescaleInterpolationType</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_rescale_size_by_factor_filter.html#ObservationRescaleSizeByFactorFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationRescaleSizeByFactorFilter" title="Permalink to this definition"></a></dt>
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationRescaleSizeByFactorFilter</code><span class="sig-paren">(</span><em>rescale_factor: float</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_rescale_size_by_factor_filter.html#ObservationRescaleSizeByFactorFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationRescaleSizeByFactorFilter" title="Permalink to this definition"></a></dt>
<dd><p>Rescales an image observation by some factor. For example, the image size
can be reduced by a factor of 2.
Warning: this requires the input observation to be of type uint8 due to scipy requirements!</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>rescale_factor</strong> the factor by which the observation will be rescaled</li>
<li><strong>rescaling_interpolation_type</strong> the interpolation type for rescaling</li>
</ul>
</td>
</tr>
</tbody>
</table>
can be reduced by a factor of 2.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>rescale_factor</strong> the factor by which the observation will be rescaled</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -361,22 +336,15 @@ Warning: this requires the input observation to be of type uint8 due to scipy re
<h3>ObservationRescaleToSizeFilter<a class="headerlink" href="#observationrescaletosizefilter" title="Permalink to this headline"></a></h3>
<dl class="class">
<dt id="rl_coach.filters.observation.ObservationRescaleToSizeFilter">
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationRescaleToSizeFilter</code><span class="sig-paren">(</span><em>output_observation_space: rl_coach.spaces.PlanarMapsObservationSpace</em>, <em>rescaling_interpolation_type: rl_coach.filters.observation.observation_rescale_to_size_filter.RescaleInterpolationType = &lt;RescaleInterpolationType.BILINEAR: 'bilinear'&gt;</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_rescale_to_size_filter.html#ObservationRescaleToSizeFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationRescaleToSizeFilter" title="Permalink to this definition"></a></dt>
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationRescaleToSizeFilter</code><span class="sig-paren">(</span><em>output_observation_space: rl_coach.spaces.PlanarMapsObservationSpace</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_rescale_to_size_filter.html#ObservationRescaleToSizeFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationRescaleToSizeFilter" title="Permalink to this definition"></a></dt>
<dd><p>Rescales an image observation to a given size. The target size does not
necessarily keep the aspect ratio of the original observation.
Warning: this requires the input observation to be of type uint8 due to scipy requirements!</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>output_observation_space</strong> the output observation space</li>
<li><strong>rescaling_interpolation_type</strong> the interpolation type for rescaling</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>output_observation_space</strong> the output observation space</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -398,14 +366,11 @@ The channels axis is assumed to be the last axis</p>
<dt id="rl_coach.filters.observation.ObservationSqueezeFilter">
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationSqueezeFilter</code><span class="sig-paren">(</span><em>axis: int = None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_squeeze_filter.html#ObservationSqueezeFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationSqueezeFilter" title="Permalink to this definition"></a></dt>
<dd><p>Removes redundant axes from the observation, which are axes with a dimension of 1.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>axis</strong> Specifies which axis to remove. If set to None, all the axes of size 1 will be removed.</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>axis</strong> Specifies which axis to remove. If set to None, all the axes of size 1 will be removed.</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -423,18 +388,14 @@ The filter adds an additional dimension to the output observation.</p>
<p>Warning: this filter replaces the observation with a LazyStack object, so no filters should be
applied after it. Applying more filters will cause the LazyStack object to be converted to a numpy array
and will increase the memory footprint.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>stack_size</strong> the number of previous observations in the stack</li>
<li><strong>stacking_axis</strong> the axis on which to stack the observation on</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>stack_size</strong> the number of previous observations in the stack</p></li>
<li><p><strong>stacking_axis</strong> the axis on which to stack the observation on</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -446,18 +407,14 @@ and increase the memory footprint.</p>
<dd><p>Converts a floating point observation into an unsigned int 8 bit observation. This is
mostly useful for reducing memory consumption and is usually used for image observations. The filter will first
spread the observation values over the range 0-255 and then discretize them into integer values.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>input_low</strong> The lowest value currently present in the observation</li>
<li><strong>input_high</strong> The highest value currently present in the observation</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input_low</strong> The lowest value currently present in the observation</p></li>
<li><p><strong>input_high</strong> The highest value currently present in the observation</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -471,18 +428,14 @@ spread the observation values over the range 0-255 and then discretize them into
<em class="property">class </em><code class="descclassname">rl_coach.filters.reward.</code><code class="descname">RewardClippingFilter</code><span class="sig-paren">(</span><em>clipping_low: float = -inf</em>, <em>clipping_high: float = inf</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/reward/reward_clipping_filter.html#RewardClippingFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.reward.RewardClippingFilter" title="Permalink to this definition"></a></dt>
<dd><p>Clips the reward values into a given range. For example, in DQN, the Atari rewards are
clipped into the range -1 and 1 in order to control the scale of the returns.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>clipping_low</strong> The low threshold for reward clipping</li>
<li><strong>clipping_high</strong> The high threshold for reward clipping</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>clipping_low</strong> The low threshold for reward clipping</p></li>
<li><p><strong>clipping_high</strong> The high threshold for reward clipping</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -494,18 +447,14 @@ clipped into the range -1 and 1 in order to control the scale of the returns.</p
<dd><p>Normalizes the reward values with a running mean and standard deviation of
all the rewards seen so far. When working with multiple workers, the statistics used for the normalization operation
are accumulated over all the workers.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>clip_min</strong> The minimum value to allow after normalizing the reward</li>
<li><strong>clip_max</strong> The maximum value to allow after normalizing the reward</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>clip_min</strong> The minimum value to allow after normalizing the reward</p></li>
<li><p><strong>clip_max</strong> The maximum value to allow after normalizing the reward</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -516,14 +465,11 @@ are accumulated over all the workers.</p>
<em class="property">class </em><code class="descclassname">rl_coach.filters.reward.</code><code class="descname">RewardRescaleFilter</code><span class="sig-paren">(</span><em>rescale_factor: float</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/reward/reward_rescale_filter.html#RewardRescaleFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.reward.RewardRescaleFilter" title="Permalink to this definition"></a></dt>
<dd><p>Rescales the reward by a given factor. Rescaling the rewards of the environment has been
observed to have a large effect (negative or positive) on the behavior of the learning process.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rescale_factor</strong> The reward rescaling factor by which the reward will be multiplied</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>rescale_factor</strong> The reward rescaling factor by which the reward will be multiplied</p>
</dd>
</dl>
</dd></dl>
</div>
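<p>Reward filters are attached to an <code>InputFilter</code> alongside observation filters. A minimal sketch,
relying only on the constructor documented above:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward import RewardRescaleFilter

# Scale the rewards down by a factor of 100 before the agent sees them; per the
# documentation above, each reward is multiplied by the given rescale factor.
input_filter = InputFilter()
input_filter.add_reward_filter('rescale', RewardRescaleFilter(1 / 100.))
</pre></div></div>
<p>In an existing preset the same call is typically made on the agent parameters' input filter
(for example <code>agent_params.input_filter.add_reward_filter(...)</code>), assuming the preset exposes one.</p>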
@@ -541,7 +487,7 @@ observed to have a large effect (negative or positive) on the behavior of the le
<a href="output_filters.html" class="btn btn-neutral float-right" title="Output Filters" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="index.html" class="btn btn-neutral" title="Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="index.html" class="btn btn-neutral float-left" title="Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -550,7 +496,7 @@ observed to have a large effect (negative or positive) on the behavior of the le
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -567,27 +513,16 @@ observed to have a large effect (negative or positive) on the behavior of the le
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Output Filters &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Output Filters &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Input Filters" href="input_filters.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -203,18 +206,14 @@ as choosing sub-boxes in a given box. For example, consider an image of size 100
a crop window of size 20x20 to attend to in the image. AttentionDiscretization allows discretizing the possible crop
windows to choose into a finite number of options, and map a discrete action space into those crop windows.</p>
<p>Warning! this will currently only work for attention spaces with 2 dimensions.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_bins_per_dimension</strong> Number of discrete bins to use for each dimension of the action space</li>
<li><strong>force_int_bins</strong> If set to True, all the bins will represent integer coordinates in space.</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_bins_per_dimension</strong> Number of discrete bins to use for each dimension of the action space</p></li>
<li><p><strong>force_int_bins</strong> If set to True, all the bins will represent integer coordinates in space.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<img alt="../../_images/attention_discretization.png" class="align-center" src="../../_images/attention_discretization.png" />
@@ -227,21 +226,17 @@ original continuous action space is uniformly separated into the given number of
action index. Each discrete action is mapped to a single N dimensional action in the BoxActionSpace action space.
For example, if the original actions space is between -1 and 1 and 5 bins were selected, the new action
space will consist of 5 actions mapped to -1, -0.5, 0, 0.5 and 1.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_bins_per_dimension</strong> The number of bins to use for each dimension of the target action space.
The bins will be spread out uniformly over this space</li>
<li><strong>force_int_bins</strong> force the bins to represent only integer actions. for example, if the action space is in
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_bins_per_dimension</strong> The number of bins to use for each dimension of the target action space.
The bins will be spread out uniformly over this space</p></li>
<li><p><strong>force_int_bins</strong> force the bins to represent only integer actions. for example, if the action space is in
the range 0-10 and there are 5 bins, then the bins will be placed at 0, 2, 5, 7, 10,
instead of 0, 2.5, 5, 7.5, 10.</li>
instead of 0, 2.5, 5, 7.5, 10.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<img alt="../../_images/box_discretization.png" class="align-center" src="../../_images/box_discretization.png" />
@@ -252,18 +247,14 @@ instead of 0, 2.5, 5, 7.5, 10.</li>
if the original action space is between -1 and 1, then this filter can be used in order to constrain the agent actions
to the range 0 to 1 instead. This essentially masks the range -1 to 0 from the agent.
The resulting action space will be shifted and will always start from 0 and have the size of the unmasked area.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>masked_target_space_low</strong> the lowest values that can be chosen in the target action space</li>
<li><strong>masked_target_space_high</strong> the highest values that can be chosen in the target action space</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>masked_target_space_low</strong> the lowest values that can be chosen in the target action space</p></li>
<li><p><strong>masked_target_space_high</strong> the highest values that can be chosen in the target action space</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<img alt="../../_images/box_masking.png" class="align-center" src="../../_images/box_masking.png" />
@@ -275,18 +266,14 @@ with a MultiSelect action space (select multiple actions at the same time, such
MultiSelect actions. If we want the agent to be able to select only 5 of those actions by their index (0-4), we can
map a discrete action space with 5 actions into the 5 selected MultiSelect actions. This will both allow the agent to
use regular discrete actions, and mask 3 of the actions from the agent.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>target_actions</strong> A partial list of actions from the target space to map to.</li>
<li><strong>descriptions</strong> a list of descriptions of each of the actions</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>target_actions</strong> A partial list of actions from the target space to map to.</p></li>
<li><p><strong>descriptions</strong> a list of descriptions of each of the actions</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<img alt="../../_images/partial_discrete_action_space_map.png" class="align-center" src="../../_images/partial_discrete_action_space_map.png" />
@@ -309,18 +296,14 @@ environment consists of continuous actions between 0 and 1, and we want the agen
the LinearBoxToBoxMap can be used to map the range -1 and 1 to the range 0 and 1 in a linear way. This means that the
action -1 will be mapped to 0, the action 1 will be mapped to 1, and the rest of the actions will be linearly mapped
between those values.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>input_space_low</strong> the low values of the desired action space</li>
<li><strong>input_space_high</strong> the high values of the desired action space</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input_space_low</strong> the low values of the desired action space</p></li>
<li><p><strong>input_space_high</strong> the high values of the desired action space</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<img alt="../../_images/linear_box_to_box_map.png" class="align-center" src="../../_images/linear_box_to_box_map.png" />
@@ -338,7 +321,7 @@ between those values.</p>
<a href="../memories/index.html" class="btn btn-neutral float-right" title="Memories" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="input_filters.html" class="btn btn-neutral" title="Input Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="input_filters.html" class="btn btn-neutral float-left" title="Input Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -347,7 +330,7 @@ between those values.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -364,27 +347,16 @@ between those values.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Memories &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Memories &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Output Filters" href="../filters/output_filters.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -210,14 +213,11 @@
<dd><p>A replay buffer that stores episodes of transitions. The additional structure allows performing various
calculations of total return and other values that depend on the sequential behavior of the transitions
in the episode.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>max_size</strong> the maximum number of transitions or episodes to hold in the memory</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>max_size</strong> the maximum number of transitions or episodes to hold in the memory</p>
</dd>
</dl>
</dd></dl>
</div>
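<p>Memory sizes in Coach are given as a <code>(MemoryGranularity, int)</code> tuple, as shown in the signatures
on this page. A minimal construction sketch (in a preset the memory is usually configured through the
corresponding parameters class rather than instantiated directly):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
from rl_coach.memories.episodic import EpisodicExperienceReplay
from rl_coach.memories.memory import MemoryGranularity

# Keep at most 1,000,000 transitions; MemoryGranularity.Episodes can be used instead
# to bound the number of stored episodes.
memory = EpisodicExperienceReplay(max_size=(MemoryGranularity.Transitions, 1000000))
</pre></div></div>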
@@ -227,22 +227,18 @@ in the episode.</p>
<dt id="rl_coach.memories.episodic.EpisodicHindsightExperienceReplay">
<em class="property">class </em><code class="descclassname">rl_coach.memories.episodic.</code><code class="descname">EpisodicHindsightExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], hindsight_transitions_per_regular_transition: int, hindsight_goal_selection_method: rl_coach.memories.episodic.episodic_hindsight_experience_replay.HindsightGoalSelectionMethod, goals_space: rl_coach.spaces.GoalsSpace</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/episodic/episodic_hindsight_experience_replay.html#EpisodicHindsightExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.episodic.EpisodicHindsightExperienceReplay" title="Permalink to this definition"></a></dt>
<dd><p>Implements Hindsight Experience Replay as described in the following paper: <a class="reference external" href="https://arxiv.org/pdf/1707.01495.pdf">https://arxiv.org/pdf/1707.01495.pdf</a></p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>max_size</strong> The maximum size of the memory. should be defined in a granularity of Transitions</li>
<li><strong>hindsight_transitions_per_regular_transition</strong> The number of hindsight artificial transitions to generate
for each actual transition</li>
<li><strong>hindsight_goal_selection_method</strong> The method that will be used for generating the goals for the
hindsight transitions. Should be one of HindsightGoalSelectionMethod</li>
<li><strong>goals_space</strong> A GoalsSpace which defines the base properties of the goals space</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>max_size</strong> The maximum size of the memory. Should be defined with a granularity of Transitions</p></li>
<li><p><strong>hindsight_transitions_per_regular_transition</strong> The number of hindsight artificial transitions to generate
for each actual transition</p></li>
<li><p><strong>hindsight_goal_selection_method</strong> The method that will be used for generating the goals for the
hindsight transitions. Should be one of HindsightGoalSelectionMethod</p></li>
<li><p><strong>goals_space</strong> A GoalsSpace which defines the base properties of the goals space</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
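<p>A construction sketch following the signature above. The <code>GoalsSpace</code> and
<code>ReachingGoal</code> construction is an assumption modeled on the bundled HER presets, and the
<code>Future</code> goal selection method is assumed to be a member of <code>HindsightGoalSelectionMethod</code>;
both should be checked against your Coach version.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
from rl_coach.memories.episodic import EpisodicHindsightExperienceReplay
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.spaces import GoalsSpace, ReachingGoal

# Goals are taken from the 'achieved_goal' measurement; reaching within 0.05 of the goal
# yields a reward of 0, otherwise -1 (the values here are illustrative).
goals_space = GoalsSpace('achieved_goal',
                         ReachingGoal(default_reward=-1, goal_reaching_reward=0,
                                      distance_from_goal_threshold=0.05),
                         GoalsSpace.DistanceMetric.Euclidean)

# Generate 4 hindsight transitions per real transition, selecting goals from future
# transitions of the same episode.
memory = EpisodicHindsightExperienceReplay(
    max_size=(MemoryGranularity.Transitions, 1000000),
    hindsight_transitions_per_regular_transition=4,
    hindsight_goal_selection_method=HindsightGoalSelectionMethod.Future,
    goals_space=goals_space)
</pre></div></div>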
@@ -253,23 +249,19 @@ hindsight transitions. Should be one of HindsightGoalSelectionMethod</li>
<em class="property">class </em><code class="descclassname">rl_coach.memories.episodic.</code><code class="descname">EpisodicHRLHindsightExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], hindsight_transitions_per_regular_transition: int, hindsight_goal_selection_method: rl_coach.memories.episodic.episodic_hindsight_experience_replay.HindsightGoalSelectionMethod, goals_space: rl_coach.spaces.GoalsSpace</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/episodic/episodic_hrl_hindsight_experience_replay.html#EpisodicHRLHindsightExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.episodic.EpisodicHRLHindsightExperienceReplay" title="Permalink to this definition"></a></dt>
<dd><p>Implements HRL Hindsight Experience Replay as described in the following paper: <a class="reference external" href="https://arxiv.org/abs/1805.08180">https://arxiv.org/abs/1805.08180</a></p>
<p>This is the memory you should use if you want a shared hindsight experience replay buffer between multiple workers</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>max_size</strong> The maximum size of the memory. should be defined in a granularity of Transitions</li>
<li><strong>hindsight_transitions_per_regular_transition</strong> The number of hindsight artificial transitions to generate
for each actual transition</li>
<li><strong>hindsight_goal_selection_method</strong> The method that will be used for generating the goals for the
hindsight transitions. Should be one of HindsightGoalSelectionMethod</li>
<li><strong>goals_space</strong> A GoalsSpace which defines the properties of the goals</li>
<li><strong>do_action_hindsight</strong> Replace the action (sub-goal) given to a lower layer, with the actual achieved goal</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>max_size</strong> The maximum size of the memory. Should be defined with a granularity of Transitions</p></li>
<li><p><strong>hindsight_transitions_per_regular_transition</strong> The number of hindsight artificial transitions to generate
for each actual transition</p></li>
<li><p><strong>hindsight_goal_selection_method</strong> The method that will be used for generating the goals for the
hindsight transitions. Should be one of HindsightGoalSelectionMethod</p></li>
<li><p><strong>goals_space</strong> A GoalsSpace which defines the properties of the goals</p></li>
<li><p><strong>do_action_hindsight</strong> Replace the action (sub-goal) given to a lower layer, with the actual achieved goal</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -289,21 +281,17 @@ hindsight transitions. Should be one of HindsightGoalSelectionMethod</li>
<dl class="class">
<dt id="rl_coach.memories.non_episodic.BalancedExperienceReplay">
<em class="property">class </em><code class="descclassname">rl_coach.memories.non_episodic.</code><code class="descname">BalancedExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], allow_duplicates_in_batch_sampling: bool = True, num_classes: int = 0, state_key_with_the_class_index: Any = 'class'</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/non_episodic/balanced_experience_replay.html#BalancedExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.non_episodic.BalancedExperienceReplay" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>max_size</strong> the maximum number of transitions or episodes to hold in the memory</li>
<li><strong>allow_duplicates_in_batch_sampling</strong> allow having the same transition multiple times in a batch</li>
<li><strong>num_classes</strong> the number of classes in the replayed data</li>
<li><strong>state_key_with_the_class_index</strong> the class index is assumed to be a value in the state dictionary.
this parameter determines the key to retrieve the class index value</li>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>max_size</strong> the maximum number of transitions or episodes to hold in the memory</p></li>
<li><p><strong>allow_duplicates_in_batch_sampling</strong> allow having the same transition multiple times in a batch</p></li>
<li><p><strong>num_classes</strong> the number of classes in the replayed data</p></li>
<li><p><strong>state_key_with_the_class_index</strong> the class index is assumed to be a value in the state dictionary.
this parameter determines the key to retrieve the class index value</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -321,18 +309,14 @@ this parameter determines the key to retrieve the class index value</li>
<dt id="rl_coach.memories.non_episodic.ExperienceReplay">
<em class="property">class </em><code class="descclassname">rl_coach.memories.non_episodic.</code><code class="descname">ExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], allow_duplicates_in_batch_sampling: bool = True</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/non_episodic/experience_replay.html#ExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.non_episodic.ExperienceReplay" title="Permalink to this definition"></a></dt>
<dd><p>A regular replay buffer which stores transition without any additional structure</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>max_size</strong> the maximum number of transitions or episodes to hold in the memory</li>
<li><strong>allow_duplicates_in_batch_sampling</strong> allow having the same transition multiple times in a batch</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>max_size</strong> the maximum number of transitions or episodes to hold in the memory</p></li>
<li><p><strong>allow_duplicates_in_batch_sampling</strong> allow having the same transition multiple times in a batch</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -343,21 +327,17 @@ this parameter determines the key to retrieve the class index value</li>
<em class="property">class </em><code class="descclassname">rl_coach.memories.non_episodic.</code><code class="descname">PrioritizedExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], alpha: float = 0.6, beta: rl_coach.schedules.Schedule = &lt;rl_coach.schedules.ConstantSchedule object&gt;, epsilon: float = 1e-06, allow_duplicates_in_batch_sampling: bool = True</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/non_episodic/prioritized_experience_replay.html#PrioritizedExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.non_episodic.PrioritizedExperienceReplay" title="Permalink to this definition"></a></dt>
<dd><p>This is the proportional sampling variant of the prioritized experience replay as described
in <a class="reference external" href="https://arxiv.org/pdf/1511.05952.pdf">https://arxiv.org/pdf/1511.05952.pdf</a>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>max_size</strong> the maximum number of transitions or episodes to hold in the memory</li>
<li><strong>alpha</strong> the alpha prioritization coefficient</li>
<li><strong>beta</strong> the beta parameter used for importance sampling</li>
<li><strong>epsilon</strong> a small value added to the priority of each transition</li>
<li><strong>allow_duplicates_in_batch_sampling</strong> allow having the same transition multiple times in a batch</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>max_size</strong> the maximum number of transitions or episodes to hold in the memory</p></li>
<li><p><strong>alpha</strong> the alpha prioritization coefficient</p></li>
<li><p><strong>beta</strong> the beta parameter used for importance sampling</p></li>
<li><p><strong>epsilon</strong> a small value added to the priority of each transition</p></li>
<li><p><strong>allow_duplicates_in_batch_sampling</strong> allow having the same transition multiple times in a batch</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
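<p>A minimal construction sketch; <code>alpha</code>, <code>beta</code> and <code>epsilon</code> keep their
defaults from the signature above, so only the size needs to be supplied:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
from rl_coach.memories.non_episodic import PrioritizedExperienceReplay
from rl_coach.memories.memory import MemoryGranularity

# Proportional prioritized replay over at most 1,000,000 transitions, with the default
# alpha=0.6 prioritization exponent and the default beta schedule for importance sampling.
memory = PrioritizedExperienceReplay(max_size=(MemoryGranularity.Transitions, 1000000))
</pre></div></div>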
@@ -385,7 +365,7 @@ are constructed on top of.</p>
<a href="../memory_backends/index.html" class="btn btn-neutral float-right" title="Memory Backends" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../filters/output_filters.html" class="btn btn-neutral" title="Output Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../filters/output_filters.html" class="btn btn-neutral float-left" title="Output Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -394,7 +374,7 @@ are constructed on top of.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -411,27 +391,16 @@ are constructed on top of.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Memory Backends &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Memory Backends &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Memories" href="../memories/index.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -193,14 +196,11 @@
<em class="property">class </em><code class="descclassname">rl_coach.memories.backend.redis.</code><code class="descname">RedisPubSubBackend</code><span class="sig-paren">(</span><em>params: rl_coach.memories.backend.redis.RedisPubSubMemoryBackendParameters</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/backend/redis.html#RedisPubSubBackend"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.backend.redis.RedisPubSubBackend" title="Permalink to this definition"></a></dt>
<dd><p>A memory backend which transfers experiences from the rollout workers to the training worker using Redis Pub/Sub
when Coach is used in distributed mode.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>params</strong> The Redis parameters to be used with this Redis Pub/Sub instance.</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>params</strong> The Redis parameters to be used with this Redis Pub/Sub instance.</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -217,7 +217,7 @@ Coach when distributed mode is used.</p>
<a href="../orchestrators/index.html" class="btn btn-neutral float-right" title="Orchestrators" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../memories/index.html" class="btn btn-neutral" title="Memories" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../memories/index.html" class="btn btn-neutral float-left" title="Memories" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -226,7 +226,7 @@ Coach when distributed mode is used.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -243,27 +243,16 @@ Coach when distributed mode is used.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Orchestrators &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Orchestrators &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Memory Backends" href="../memory_backends/index.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -193,14 +196,11 @@
<em class="property">class </em><code class="descclassname">rl_coach.orchestrators.kubernetes_orchestrator.</code><code class="descname">Kubernetes</code><span class="sig-paren">(</span><em>params: rl_coach.orchestrators.kubernetes_orchestrator.KubernetesParameters</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/orchestrators/kubernetes_orchestrator.html#Kubernetes"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.orchestrators.kubernetes_orchestrator.Kubernetes" title="Permalink to this definition"></a></dt>
<dd><p>An orchestrator implementation which uses Kubernetes to deploy components such as the training and rollout workers
and Redis Pub/Sub when Coach is used in distributed mode.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>params</strong> The Kubernetes parameters which are used for deploying the components in Coach. These parameters</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>params</strong> The Kubernetes parameters which are used for deploying the components in Coach. These parameters</p>
</dd>
</dl>
<p>include namespace and kubeconfig.</p>
</dd></dl>
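<p>A minimal usage sketch for the class above, assuming <code>KubernetesParameters</code> can be constructed with defaults and exposes the namespace and kubeconfig settings mentioned in the parameter description; the attribute names used here are assumptions, not a verified signature.</p>
<div class="highlight"><pre>
# A minimal sketch, assuming KubernetesParameters is default-constructible and
# carries namespace / kubeconfig settings as described above (assumed attribute names).
from rl_coach.orchestrators.kubernetes_orchestrator import Kubernetes, KubernetesParameters

params = KubernetesParameters()            # assumed default construction
params.namespace = "coach-distributed"     # hypothetical namespace
params.kubeconfig = "~/.kube/config"       # hypothetical kubeconfig path

# Deploys training workers, rollout workers and Redis Pub/Sub for distributed mode.
orchestrator = Kubernetes(params)
</pre></div>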
@@ -218,7 +218,7 @@ and Redis Pub/Sub in Coach when used in the distributed mode.</p>
<a href="../core_types.html" class="btn btn-neutral float-right" title="Core Types" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../memory_backends/index.html" class="btn btn-neutral" title="Memory Backends" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../memory_backends/index.html" class="btn btn-neutral float-left" title="Memory Backends" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -227,7 +227,7 @@ and Redis Pub/Sub in Coach when used in the distributed mode.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -244,27 +244,16 @@ and Redis Pub/Sub in Coach when used in the distributed mode.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Spaces &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Spaces &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script type="text/javascript" src="../_static/jquery.js"></script>
<script type="text/javascript" src="../_static/underscore.js"></script>
<script type="text/javascript" src="../_static/doctools.js"></script>
<script type="text/javascript" src="../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Core Types" href="core_types.html" />
<link href="../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -207,52 +210,44 @@
<dt id="rl_coach.spaces.Space">
<em class="property">class </em><code class="descclassname">rl_coach.spaces.</code><code class="descname">Space</code><span class="sig-paren">(</span><em>shape: Union[int, tuple, list, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/spaces.html#Space"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.Space" title="Permalink to this definition"></a></dt>
<dd><p>A space defines a set of valid values</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>shape</strong> the shape of the space</li>
<li><strong>low</strong> the lowest values possible in the space. can be an array defining the lowest values per point,
or a single value defining the general lowest values</li>
<li><strong>high</strong> the highest values possible in the space. can be an array defining the highest values per point,
or a single value defining the general highest values</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>shape</strong> the shape of the space</p></li>
<li><p><strong>low</strong> the lowest values possible in the space. can be an array defining the lowest values per point,
or a single value defining the general lowest values</p></li>
<li><p><strong>high</strong> the highest values possible in the space. can be an array defining the highest values per point,
or a single value defining the general highest values</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
<dl class="method">
<dt id="rl_coach.spaces.Space.contains">
<code class="descname">contains</code><span class="sig-paren">(</span><em>val: Union[int, float, numpy.ndarray]</em><span class="sig-paren">)</span> &#x2192; bool<a class="reference internal" href="../_modules/rl_coach/spaces.html#Space.contains"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.Space.contains" title="Permalink to this definition"></a></dt>
<dd><p>Checks if value is contained by this space. The shape must match and
all of the values must be within the low and high bounds.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> a value to check</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True / False depending on if the val matches the space definition</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>val</strong> a value to check</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True / False depending on if the val matches the space definition</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.Space.is_valid_index">
<code class="descname">is_valid_index</code><span class="sig-paren">(</span><em>index: numpy.ndarray</em><span class="sig-paren">)</span> &#x2192; bool<a class="reference internal" href="../_modules/rl_coach/spaces.html#Space.is_valid_index"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.Space.is_valid_index" title="Permalink to this definition"></a></dt>
<dd><p>Checks if a given multidimensional index is within the bounds of the shape of the space</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>index</strong> a multidimensional index</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True if the index is within the shape of the space. False otherwise</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>index</strong> a multidimensional index</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True if the index is within the shape of the space. False otherwise</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -260,14 +255,11 @@ all of the values must be within the low and high bounds.</p>
<code class="descname">sample</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/spaces.html#Space.sample"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.Space.sample" title="Permalink to this definition"></a></dt>
<dd><p>Sample the defined space: uniformly if space bounds are defined, or from a normal distribution if no
bounds are defined</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A numpy array sampled from the space</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>A numpy array sampled from the space</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
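<p>A short sketch of the <code>Space</code> interface documented above, assuming the constructor behaves as its signature suggests (integer shape, per-dimension low/high bounds):</p>
<div class="highlight"><pre>
# A sketch using only the documented Space API: shape, low/high bounds,
# contains(), is_valid_index() and sample().
import numpy as np
from rl_coach.spaces import Space

space = Space(shape=2, low=np.array([0.0, -1.0]), high=np.array([1.0, 1.0]))

space.contains(np.array([0.5, 0.0]))   # expected True: shape matches, values within bounds
space.contains(np.array([2.0, 0.0]))   # expected False: first value exceeds the high bound
space.is_valid_index(np.array([1]))    # index within the shape of the space
sample = space.sample()                # uniform sample, since bounds are defined here
</pre></div>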
@@ -283,32 +275,28 @@ bounds are defined</p>
<code class="descname">contains</code><span class="sig-paren">(</span><em>val: Union[int, float, numpy.ndarray]</em><span class="sig-paren">)</span> &#x2192; bool<a class="headerlink" href="#rl_coach.spaces.ObservationSpace.contains" title="Permalink to this definition"></a></dt>
<dd><p>Checks if value is contained by this space. The shape must match and
all of the values must be within the low and high bounds.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> a value to check</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True / False depending on if the val matches the space definition</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>val</strong> a value to check</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True / False depending on if the val matches the space definition</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.ObservationSpace.is_valid_index">
<code class="descname">is_valid_index</code><span class="sig-paren">(</span><em>index: numpy.ndarray</em><span class="sig-paren">)</span> &#x2192; bool<a class="headerlink" href="#rl_coach.spaces.ObservationSpace.is_valid_index" title="Permalink to this definition"></a></dt>
<dd><p>Checks if a given multidimensional index is within the bounds of the shape of the space</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>index</strong> a multidimensional index</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True if the index is within the shape of the space. False otherwise</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>index</strong> a multidimensional index</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True if the index is within the shape of the space. False otherwise</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -316,14 +304,11 @@ all of the values must be within the low and high bounds.</p>
<code class="descname">sample</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="headerlink" href="#rl_coach.spaces.ObservationSpace.sample" title="Permalink to this definition"></a></dt>
<dd><p>Sample the defined space: uniformly if space bounds are defined, or from a normal distribution if no
bounds are defined</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A numpy array sampled from the space</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>A numpy array sampled from the space</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
@@ -368,16 +353,14 @@ represent a RGB image, or a grayscale image.</p>
<dt id="rl_coach.spaces.ActionSpace.clip_action_to_space">
<code class="descname">clip_action_to_space</code><span class="sig-paren">(</span><em>action: Union[int, float, numpy.ndarray, List]</em><span class="sig-paren">)</span> &#x2192; Union[int, float, numpy.ndarray, List]<a class="reference internal" href="../_modules/rl_coach/spaces.html#ActionSpace.clip_action_to_space"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.ActionSpace.clip_action_to_space" title="Permalink to this definition"></a></dt>
<dd><p>Given an action, clip its values to fit within the action space ranges</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> a given action</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the clipped action</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action</strong> a given action</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the clipped action</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -385,32 +368,28 @@ represent a RGB image, or a grayscale image.</p>
<code class="descname">contains</code><span class="sig-paren">(</span><em>val: Union[int, float, numpy.ndarray]</em><span class="sig-paren">)</span> &#x2192; bool<a class="headerlink" href="#rl_coach.spaces.ActionSpace.contains" title="Permalink to this definition"></a></dt>
<dd><p>Checks if value is contained by this space. The shape must match and
all of the values must be within the low and high bounds.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> a value to check</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True / False depending on if the val matches the space definition</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>val</strong> a value to check</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True / False depending on if the val matches the space definition</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.ActionSpace.is_valid_index">
<code class="descname">is_valid_index</code><span class="sig-paren">(</span><em>index: numpy.ndarray</em><span class="sig-paren">)</span> &#x2192; bool<a class="headerlink" href="#rl_coach.spaces.ActionSpace.is_valid_index" title="Permalink to this definition"></a></dt>
<dd><p>Checks if a given multidimensional index is within the bounds of the shape of the space</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>index</strong> a multidimensional index</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True if the index is within the shape of the space. False otherwise</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>index</strong> a multidimensional index</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True if the index is within the shape of the space. False otherwise</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -418,28 +397,22 @@ all of the values must be within the low and high bounds.</p>
<code class="descname">sample</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="headerlink" href="#rl_coach.spaces.ActionSpace.sample" title="Permalink to this definition"></a></dt>
<dd><p>Sample the defined space: uniformly if space bounds are defined, or from a normal distribution if no
bounds are defined</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A numpy array sampled from the space</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>A numpy array sampled from the space</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.ActionSpace.sample_with_info">
<code class="descname">sample_with_info</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.ActionInfo<a class="reference internal" href="../_modules/rl_coach/spaces.html#ActionSpace.sample_with_info"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.ActionSpace.sample_with_info" title="Permalink to this definition"></a></dt>
<dd><p>Get a random action with additional “fake” info</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">An action info instance</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>An action info instance</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
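<p>A short sketch of the <code>ActionSpace</code> methods above; since the constructor is not shown on this page, <code>action_space</code> is assumed to be an existing instance (for example, one taken from an environment):</p>
<div class="highlight"><pre>
# A sketch over an assumed, pre-existing `action_space` instance.
import numpy as np

action = action_space.sample()                                 # random action sampled from the space
info = action_space.sample_with_info()                         # ActionInfo carrying additional "fake" info
clipped = action_space.clip_action_to_space(np.array([10.0]))  # values clipped to the space ranges
valid = action_space.contains(clipped)                         # True once the clipped action fits the space
</pre></div>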
@@ -505,21 +478,17 @@ by itself an action space. In Starcraft, the arguments are Discrete action space
agents can use it as an output action space.
The class acts as a wrapper to the target space. So after setting the target space, all the values of the class
will match the values of the target space (the shape, low, high, etc.)</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>goal_name</strong> the name of the observation space to use as the achieved goal.</li>
<li><strong>reward_type</strong> the reward type to use for converting distances from goal to rewards</li>
<li><strong>distance_metric</strong> the distance metric to use. could be either one of the distances in the
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>goal_name</strong> the name of the observation space to use as the achieved goal.</p></li>
<li><p><strong>reward_type</strong> the reward type to use for converting distances from goal to rewards</p></li>
<li><p><strong>distance_metric</strong> the distance metric to use. could be either one of the distances in the
DistanceMetric enum, or a custom function that gets two vectors as input and
returns the distance between them</li>
returns the distance between them</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
<dl class="class">
<dt id="rl_coach.spaces.GoalsSpace.DistanceMetric">
<em class="property">class </em><code class="descname">DistanceMetric</code><a class="reference internal" href="../_modules/rl_coach/spaces.html#GoalsSpace.DistanceMetric"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.GoalsSpace.DistanceMetric" title="Permalink to this definition"></a></dt>
@@ -530,16 +499,14 @@ returns the distance between them</li>
<dt id="rl_coach.spaces.GoalsSpace.clip_action_to_space">
<code class="descname">clip_action_to_space</code><span class="sig-paren">(</span><em>action: Union[int, float, numpy.ndarray, List]</em><span class="sig-paren">)</span> &#x2192; Union[int, float, numpy.ndarray, List]<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.clip_action_to_space" title="Permalink to this definition"></a></dt>
<dd><p>Given an action, clip its values to fit within the action space ranges</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> a given action</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the clipped action</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action</strong> a given action</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the clipped action</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -547,90 +514,76 @@ returns the distance between them</li>
<code class="descname">contains</code><span class="sig-paren">(</span><em>val: Union[int, float, numpy.ndarray]</em><span class="sig-paren">)</span> &#x2192; bool<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.contains" title="Permalink to this definition"></a></dt>
<dd><p>Checks if value is contained by this space. The shape must match and
all of the values must be within the low and high bounds.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> a value to check</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True / False depending on if the val matches the space definition</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>val</strong> a value to check</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True / False depending on if the val matches the space definition</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.GoalsSpace.distance_from_goal">
<code class="descname">distance_from_goal</code><span class="sig-paren">(</span><em>goal: numpy.ndarray</em>, <em>state: dict</em><span class="sig-paren">)</span> &#x2192; float<a class="reference internal" href="../_modules/rl_coach/spaces.html#GoalsSpace.distance_from_goal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.GoalsSpace.distance_from_goal" title="Permalink to this definition"></a></dt>
<dd><p>Given a state, compute its distance from the goal</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>goal</strong> a numpy array representing the goal</li>
<li><strong>state</strong> a dict representing the state</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>goal</strong> a numpy array representing the goal</p></li>
<li><p><strong>state</strong> a dict representing the state</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the distance from the goal</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the distance from the goal</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.GoalsSpace.get_reward_for_goal_and_state">
<code class="descname">get_reward_for_goal_and_state</code><span class="sig-paren">(</span><em>goal: numpy.ndarray</em>, <em>state: dict</em><span class="sig-paren">)</span> &#x2192; Tuple[float, bool]<a class="reference internal" href="../_modules/rl_coach/spaces.html#GoalsSpace.get_reward_for_goal_and_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.GoalsSpace.get_reward_for_goal_and_state" title="Permalink to this definition"></a></dt>
<dd><p>Given a state, check if the goal was reached and return a reward accordingly</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>goal</strong> a numpy array representing the goal</li>
<li><strong>state</strong> a dict representing the state</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>goal</strong> a numpy array representing the goal</p></li>
<li><p><strong>state</strong> a dict representing the state</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the reward for the current goal and state pair and a boolean representing if the goal was reached</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the reward for the current goal and state pair and a boolean representing if the goal was reached</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.GoalsSpace.goal_from_state">
<code class="descname">goal_from_state</code><span class="sig-paren">(</span><em>state: Dict</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/spaces.html#GoalsSpace.goal_from_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.GoalsSpace.goal_from_state" title="Permalink to this definition"></a></dt>
<dd><p>Given a state, extract an observation according to the goal_name</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state</strong> a dictionary of observations</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the observation corresponding to the goal_name</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>state</strong> a dictionary of observations</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the observation corresponding to the goal_name</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.GoalsSpace.is_valid_index">
<code class="descname">is_valid_index</code><span class="sig-paren">(</span><em>index: numpy.ndarray</em><span class="sig-paren">)</span> &#x2192; bool<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.is_valid_index" title="Permalink to this definition"></a></dt>
<dd><p>Checks if a given multidimensional index is within the bounds of the shape of the space</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>index</strong> a multidimensional index</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True if the index is within the shape of the space. False otherwise</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>index</strong> a multidimensional index</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True if the index is within the shape of the space. False otherwise</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -638,28 +591,22 @@ all of the values must be within the low and high bounds.</p>
<code class="descname">sample</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.sample" title="Permalink to this definition"></a></dt>
<dd><p>Sample the defined space: uniformly if space bounds are defined, or from a normal distribution if no
bounds are defined</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A numpy array sampled from the space</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>A numpy array sampled from the space</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.spaces.GoalsSpace.sample_with_info">
<code class="descname">sample_with_info</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.ActionInfo<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.sample_with_info" title="Permalink to this definition"></a></dt>
<dd><p>Get a random action with additional “fake” info</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">An action info instance</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>An action info instance</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
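<p>A sketch of the goal-space flow described above. The custom distance function follows the documented contract (two vectors in, a distance out), while <code>reward_type</code> and <code>state</code> are placeholders, since their concrete types are not shown on this page:</p>
<div class="highlight"><pre>
# A sketch, assuming `reward_type` and `state` are provided elsewhere
# (state is described above as a dict of observations).
import numpy as np
from rl_coach.spaces import GoalsSpace

def euclidean(a, b):
    # documented contract: two vectors in, their distance out
    return float(np.linalg.norm(a - b))

goals_space = GoalsSpace(goal_name="observation",   # hypothetical observation name
                         reward_type=reward_type,   # placeholder, see the class parameters above
                         distance_metric=euclidean)

goal = goals_space.goal_from_state(state)                         # observation matching goal_name
distance = goals_space.distance_from_goal(goal, state)            # scalar distance from the goal
reward, reached = goals_space.get_reward_for_goal_and_state(goal, state)
</pre></div>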
@@ -678,7 +625,7 @@ bounds are defined</p>
<a href="additional_parameters.html" class="btn btn-neutral float-right" title="Additional Parameters" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="core_types.html" class="btn btn-neutral" title="Core Types" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="core_types.html" class="btn btn-neutral float-left" title="Core Types" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -687,7 +634,7 @@ bounds are defined</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -704,27 +651,16 @@ bounds are defined</p>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script type="text/javascript" src="../_static/jquery.js"></script>
<script type="text/javascript" src="../_static/underscore.js"></script>
<script type="text/javascript" src="../_static/doctools.js"></script>
<script type="text/javascript" src="../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>