
Enabling Coach Documentation to be run even when environments are not installed (#326)

anabwan, 2019-05-27 10:46:07 +03:00, committed by Gal Leibovich
parent 2b7d536da4
commit 342b7184bc
157 changed files with 5167 additions and 7477 deletions

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Behavioral Cloning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Behavioral Cloning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="ACER" href="../policy_optimization/acer.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -230,9 +233,9 @@ These demonstrations are given as state, action tuples, and with no reward.
The training goal is to reduce the difference between the actions predicted by the network and the actions taken by
the expert for each state.</p>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Use the current states as input to the network, and the expert actions as the targets of the network.</li>
<li>For the network head, we use the policy head, which uses the cross entropy loss function.</li>
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>Use the current states as input to the network, and the expert actions as the targets of the network.</p></li>
<li><p>For the network head, we use the policy head, which uses the cross entropy loss function.</p></li>
</ol>
<dl class="class">
<dt id="rl_coach.agents.bc_agent.BCAlgorithmParameters">
@@ -254,7 +257,7 @@ the expert for each state.</p>
<a href="../value_optimization/bs_dqn.html" class="btn btn-neutral float-right" title="Bootstrapped DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../policy_optimization/acer.html" class="btn btn-neutral" title="ACER" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../policy_optimization/acer.html" class="btn btn-neutral float-left" title="ACER" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -263,7 +266,7 @@ the expert for each state.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -280,27 +283,16 @@ the expert for each state.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>
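The behavioral cloning update described above (steps 1-3: sample a batch, feed the states to the network, use the expert actions as cross-entropy targets for the policy head) can be sketched roughly as follows. This is an illustrative NumPy-only sketch, not Coach's implementation; the function and argument names are assumptions.

import numpy as np

def bc_training_step(batch_states, expert_actions, policy_logits_fn, num_actions):
    # batch_states: states sampled from the replay buffer (step 1)
    # expert_actions: integer index of the action the expert took in each state
    # policy_logits_fn: any callable mapping states to unnormalized action logits
    logits = policy_logits_fn(batch_states)                # (batch, num_actions)
    # softmax over the action dimension
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)
    # one-hot targets built from the expert actions (step 2)
    targets = np.eye(num_actions)[expert_actions]
    # cross-entropy loss of the policy head, averaged over the batch (step 3)
    return -np.mean(np.sum(targets * np.log(probs + 1e-8), axis=1))

A real agent would then backpropagate this loss through the network; only the loss computation itself is shown here.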

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Conditional Imitation Learning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Conditional Imitation Learning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Categorical DQN" href="../value_optimization/categorical_dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -233,25 +236,22 @@ the expert for each state.
In conditional imitation learning, each transition is assigned a class, which determines the goal that was pursued
in that transition. For example, 3 possible classes can be: turn right, turn left and follow lane.</p>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer, where the batch is balanced, meaning that an equal number
of transitions will be sampled from each class index.</li>
<li>Use the current states as input to the network, and assign the expert actions as the targets of the network heads
<li><p>Sample a batch of transitions from the replay buffer, where the batch is balanced, meaning that an equal number
of transitions will be sampled from each class index.</p></li>
<li><p>Use the current states as input to the network, and assign the expert actions as the targets of the network heads
corresponding to the state classes. For the other heads, set the targets to match the currently predicted values,
so that the loss for the other heads will be zeroed out.</li>
<li>We use a regression head that minimizes the MSE loss between the network predicted values and the target values.</li>
so that the loss for the other heads will be zeroed out.</p></li>
<li><p>We use a regression head that minimizes the MSE loss between the network predicted values and the target values.</p></li>
</ol>
<dl class="class">
<dt id="rl_coach.agents.cil_agent.CILAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.cil_agent.</code><code class="descname">CILAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/cil_agent.html#CILAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.cil_agent.CILAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state_key_with_the_class_index</strong> (str)
The key of the state dictionary which corresponds to the value that will be used to control the class index.</td>
</tr>
</tbody>
</table>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>state_key_with_the_class_index</strong> (str)
The key of the state dictionary which corresponds to the value that will be used to control the class index.</p>
</dd>
</dl>
</dd></dl>
</div>
@@ -269,7 +269,7 @@ The key of the state dictionary which corresponds to the value that will be used
<a href="../policy_optimization/cppo.html" class="btn btn-neutral float-right" title="Clipped Proximal Policy Optimization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../value_optimization/categorical_dqn.html" class="btn btn-neutral" title="Categorical DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../value_optimization/categorical_dqn.html" class="btn btn-neutral float-left" title="Categorical DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -278,7 +278,7 @@ The key of the state dictionary which corresponds to the value that will be used
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -295,27 +295,16 @@ The key of the state dictionary which corresponds to the value that will be used
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>
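The conditional imitation learning procedure above can be sketched in the same spirit: sample a class-balanced batch, then build regression targets so that only the head matching each transition's class is trained towards the expert action while the other heads see zero MSE loss. Shapes and names below are assumptions for illustration, not Coach's internals.

import numpy as np

def sample_balanced_batch(buffers_per_class, per_class, rng=None):
    # Step 1: draw an equal number of transitions from each class's buffer.
    if rng is None:
        rng = np.random.default_rng()
    batch = []
    for transitions in buffers_per_class:
        idx = rng.choice(len(transitions), size=per_class, replace=True)
        batch.extend(transitions[i] for i in idx)
    return batch

def build_cil_targets(predicted, expert_actions, class_indices):
    # Steps 2-3: the head matching each transition's class gets the expert
    # action as its regression target; every other head keeps its current
    # prediction as the target, so its MSE loss contribution is zero.
    # predicted:      (batch, num_heads, action_dim) current network outputs
    # expert_actions: (batch, action_dim) actions taken by the expert
    # class_indices:  (batch,) e.g. 0=turn right, 1=turn left, 2=follow lane
    targets = predicted.copy()
    targets[np.arange(len(class_indices)), class_indices] = expert_actions
    return targets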

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Agents &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Agents &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Adding a New Environment" href="../../contributing/add_env.html" />
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -241,59 +244,50 @@ A detailed description of those algorithms can be found by navigating to each of
<dl class="class">
<dt id="rl_coach.base_parameters.AgentParameters">
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">AgentParameters</code><span class="sig-paren">(</span><em>algorithm: rl_coach.base_parameters.AlgorithmParameters, exploration: ExplorationParameters, memory: MemoryParameters, networks: Dict[str, rl_coach.base_parameters.NetworkParameters], visualization: rl_coach.base_parameters.VisualizationParameters = &lt;rl_coach.base_parameters.VisualizationParameters object&gt;</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/base_parameters.html#AgentParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.AgentParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>algorithm</strong> A class inheriting AlgorithmParameters.
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>algorithm</strong> A class inheriting AlgorithmParameters.
The parameters used for the specific algorithm used by the agent.
These parameters can be later referenced in the agent implementation through self.ap.algorithm.</li>
<li><strong>exploration</strong> Either a class inheriting ExplorationParameters or a dictionary mapping between action
These parameters can be later referenced in the agent implementation through self.ap.algorithm.</p></li>
<li><p><strong>exploration</strong> Either a class inheriting ExplorationParameters or a dictionary mapping between action
space types and their corresponding ExplorationParameters. If a dictionary was used,
when the agent will be instantiated, the correct exploration policy parameters will be used
according to the real type of the environment action space.
These parameters will be used to instantiate the exploration policy.</li>
<li><strong>memory</strong> A class inheriting MemoryParameters. It defines all the parameters used by the memory module.</li>
<li><strong>networks</strong> A dictionary mapping between network names and their corresponding network parameters, defined
These parameters will be used to instantiate the exploration policy.</p></li>
<li><p><strong>memory</strong> A class inheriting MemoryParameters. It defines all the parameters used by the memory module.</p></li>
<li><p><strong>networks</strong> A dictionary mapping between network names and their corresponding network parameters, defined
as a class inheriting NetworkParameters. Each element will be used in order to instantiate
a NetworkWrapper class, and all the network wrappers will be stored in the agent under
self.network_wrappers. self.network_wrappers is a dict mapping between the network name that
was given in the networks dict, and the instantiated network wrapper.</li>
<li><strong>visualization</strong> A class inheriting VisualizationParameters and defining various parameters that can be
used for visualization purposes, such as printing to the screen, rendering, and saving videos.</li>
was given in the networks dict, and the instantiated network wrapper.</p></li>
<li><p><strong>visualization</strong> A class inheriting VisualizationParameters and defining various parameters that can be
used for visualization purposes, such as printing to the screen, rendering, and saving videos.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="rl_coach.agents.agent.Agent">
<em class="property">class </em><code class="descclassname">rl_coach.agents.agent.</code><code class="descname">Agent</code><span class="sig-paren">(</span><em>agent_parameters: rl_coach.base_parameters.AgentParameters</em>, <em>parent: Union[LevelManager</em>, <em>CompositeAgent] = None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>agent_parameters</strong> An AgentParameters class instance with all the agent parameters</td>
</tr>
</tbody>
</table>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>agent_parameters</strong> An AgentParameters class instance with all the agent parameters</p>
</dd>
</dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.act">
<code class="descname">act</code><span class="sig-paren">(</span><em>action: Union[None</em>, <em>int</em>, <em>float</em>, <em>numpy.ndarray</em>, <em>List] = None</em><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.ActionInfo<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.act"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.act" title="Permalink to this definition"></a></dt>
<dd><p>Given the agent's current knowledge, decide on the next action to apply to the environment</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> An action to take, overriding whatever the current policy is</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">An ActionInfo object, which contains the action and any additional info from the action decision process</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action</strong> An action to take, overriding whatever the current policy is</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>An ActionInfo object, which contains the action and any additional info from the action decision process</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -302,21 +296,17 @@ used for visualization purposes, such as printing to the screen, rendering, and
<dd><p>This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>func</strong> the name of the memory function to call</li>
<li><strong>args</strong> the arguments to supply to the function</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>func</strong> the name of the memory function to call</p></li>
<li><p><strong>args</strong> the arguments to supply to the function</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the return value of the function</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the return value of the function</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -324,16 +314,14 @@ both with a shared and a local memory.</p>
<code class="descname">choose_action</code><span class="sig-paren">(</span><em>curr_state</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.choose_action"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.choose_action" title="Permalink to this definition"></a></dt>
<dd><p>choose an action to act with in the current episode being played. Different behavior might be exhibited when
training or testing.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>curr_state</strong> the current state to act upon.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">chosen action, some action value describing the action (q-value, probability, etc)</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>curr_state</strong> the current state to act upon.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>chosen action, some action value describing the action (q-value, probability, etc)</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -351,14 +339,11 @@ training or testing.</p>
<dd><p>Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A list containing all the networks</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>A list containing all the networks</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -367,37 +352,31 @@ for creating the network.</p>
<dd><p>Get a prediction from the agent with regard to the requested prediction_type.
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
raise a ValueException.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>states</strong> The states to get a prediction for</li>
<li><strong>prediction_type</strong> The type of prediction to get for the states. For example, the state-value prediction.</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>states</strong> The states to get a prediction for</p></li>
<li><p><strong>prediction_type</strong> The type of prediction to get for the states. For example, the state-value prediction.</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the predicted values</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the predicted values</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.get_state_embedding">
<code class="descname">get_state_embedding</code><span class="sig-paren">(</span><em>state: dict</em><span class="sig-paren">)</span> &#x2192; numpy.ndarray<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.get_state_embedding"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.get_state_embedding" title="Permalink to this definition"></a></dt>
<dd><p>Given a state, get the corresponding state embedding from the main network</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state</strong> a state dict</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy embedding vector</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>state</strong> a state dict</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a numpy embedding vector</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -406,14 +385,11 @@ raise a ValueException.</p>
<dd><p>Make any changes needed when each episode is ended.
This includes incrementing counters, updating full episode dependent values, updating logs, etc.
This function is called right after each episode is ended.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -421,44 +397,36 @@ This function is called right after each episode is ended.</p>
<code class="descname">init_environment_dependent_modules</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.init_environment_dependent_modules"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.init_environment_dependent_modules" title="Permalink to this definition"></a></dt>
<dd><p>Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.learn_from_batch">
<code class="descname">learn_from_batch</code><span class="sig-paren">(</span><em>batch</em><span class="sig-paren">)</span> &#x2192; Tuple[float, List, List]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.learn_from_batch"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.learn_from_batch" title="Permalink to this definition"></a></dt>
<dd><p>Given a batch of transitions, calculates their target values and updates the network.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>batch</strong> A list of transitions</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">The total loss of the training, the loss per head and the unclipped gradients</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>batch</strong> A list of transitions</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The total loss of the training, the loss per head and the unclipped gradients</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.log_to_screen">
<code class="descname">log_to_screen</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.log_to_screen"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.log_to_screen" title="Permalink to this definition"></a></dt>
<dd><p>Write an episode summary line to the terminal</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -467,59 +435,48 @@ the observation space</p>
<dd><p>Given a response from the environment, distill the observation from it and store it for later use.
The response should be a dictionary containing the performed action, the new observation and measurements,
the reward, a game over flag and any additional information necessary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>env_response</strong> result of call from environment.step(action)</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a boolean value which determines if the agent has decided to terminate the episode after seeing the
given observation</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>env_response</strong> result of call from environment.step(action)</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a boolean value which determines if the agent has decided to terminate the episode after seeing the
given observation</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
<dt id="rl_coach.agents.agent.Agent.parent">
<code class="descname">parent</code><a class="headerlink" href="#rl_coach.agents.agent.Agent.parent" title="Permalink to this definition"></a></dt>
<dd><p>Get the parent class of the agent</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the current phase</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>the current phase</p>
</dd>
</dl>
</dd></dl>
<dl class="attribute">
<dt id="rl_coach.agents.agent.Agent.phase">
<code class="descname">phase</code><a class="headerlink" href="#rl_coach.agents.agent.Agent.phase" title="Permalink to this definition"></a></dt>
<dd><p>The current running phase of the agent</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">RunPhase</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>RunPhase</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.post_training_commands">
<code class="descname">post_training_commands</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.post_training_commands"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.post_training_commands" title="Permalink to this definition"></a></dt>
<dd><p>A function which allows adding any functionality that is required to run right after the training phase ends.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -527,45 +484,37 @@ given observation</td>
<code class="descname">prepare_batch_for_inference</code><span class="sig-paren">(</span><em>states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str</em><span class="sig-paren">)</span> &#x2192; Dict[str, numpy.array]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.prepare_batch_for_inference"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.prepare_batch_for_inference" title="Permalink to this definition"></a></dt>
<dd><p>Convert curr_state into input tensors tensorflow is expecting. i.e. if we have several input states, stack all
observations together, measurements together, etc.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>states</strong> A list of environment states, where each one is a dict mapping from an observation name to its
corresponding observation</li>
<li><strong>network_name</strong> The agent network name to prepare the batch for. this is needed in order to extract only
the observation relevant for the network from the states.</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>states</strong> A list of environment states, where each one is a dict mapping from an observation name to its
corresponding observation</p></li>
<li><p><strong>network_name</strong> The agent network name to prepare the batch for. this is needed in order to extract only
the observation relevant for the network from the states.</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A dictionary containing a list of values from all the given states for each of the observations</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A dictionary containing a list of values from all the given states for each of the observations</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.register_signal">
<code class="descname">register_signal</code><span class="sig-paren">(</span><em>signal_name: str</em>, <em>dump_one_value_per_episode: bool = True</em>, <em>dump_one_value_per_step: bool = False</em><span class="sig-paren">)</span> &#x2192; rl_coach.utils.Signal<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.register_signal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.register_signal" title="Permalink to this definition"></a></dt>
<dd><p>Register a signal such that its statistics will be dumped and be viewable through dashboard</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>signal_name</strong> the name of the signal as it will appear in dashboard</li>
<li><strong>dump_one_value_per_episode</strong> should the signal value be written for each episode?</li>
<li><strong>dump_one_value_per_step</strong> should the signal value be written for each step?</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>signal_name</strong> the name of the signal as it will appear in dashboard</p></li>
<li><p><strong>dump_one_value_per_episode</strong> should the signal value be written for each episode?</p></li>
<li><p><strong>dump_one_value_per_step</strong> should the signal value be written for each step?</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the created signal</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the created signal</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -574,46 +523,39 @@ the observation relevant for the network from the states.</li>
<dd><p>Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
by val, and by the current phase set in self.phase.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> The new phase to change to</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>val</strong> The new phase to change to</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.reset_internal_state">
<code class="descname">reset_internal_state</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.reset_internal_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.reset_internal_state" title="Permalink to this definition"></a></dt>
<dd><p>Reset all the episodic parameters. This function is called right before each episode starts.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.restore_checkpoint">
<code class="descname">restore_checkpoint</code><span class="sig-paren">(</span><em>checkpoint_dir: str</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.restore_checkpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.restore_checkpoint" title="Permalink to this definition"></a></dt>
<dd><p>Allows agents to store additional information when saving checkpoints.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>checkpoint_dir</strong> The checkpoint dir to restore from</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>checkpoint_dir</strong> The checkpoint dir to restore from</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -621,51 +563,42 @@ by val, and by the current phase set in self.phase.</p>
<code class="descname">run_off_policy_evaluation</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="headerlink" href="#rl_coach.agents.agent.Agent.run_off_policy_evaluation" title="Permalink to this definition"></a></dt>
<dd><p>Run off-policy evaluation estimators to evaluate the trained policy performance against a dataset.
Should only be implemented for off-policy RL algorithms.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.run_pre_network_filter_for_inference">
<code class="descname">run_pre_network_filter_for_inference</code><span class="sig-paren">(</span><em>state: Dict[str, numpy.ndarray], update_filter_internal_state: bool = True</em><span class="sig-paren">)</span> &#x2192; Dict[str, numpy.ndarray]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.run_pre_network_filter_for_inference"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.run_pre_network_filter_for_inference" title="Permalink to this definition"></a></dt>
<dd><p>Run filters which were defined for being applied right before using the state for inference.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>state</strong> The state to run the filters on</li>
<li><strong>update_filter_internal_state</strong> Should update the filters internal state - should not update when evaluating</li>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>state</strong> The state to run the filters on</p></li>
<li><p><strong>update_filter_internal_state</strong> Should update the filters internal state - should not update when evaluating</p></li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">The filtered state</p>
</td>
</tr>
</tbody>
</table>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The filtered state</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.save_checkpoint">
<code class="descname">save_checkpoint</code><span class="sig-paren">(</span><em>checkpoint_prefix: str</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.save_checkpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.save_checkpoint" title="Permalink to this definition"></a></dt>
<dd><p>Allows agents to store additional information when saving checkpoints.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>checkpoint_prefix</strong> The prefix of the checkpoint file to save</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>checkpoint_prefix</strong> The prefix of the checkpoint file to save</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -673,16 +606,14 @@ Should only be implemented for off-policy RL algorithms.</p>
<code class="descname">set_environment_parameters</code><span class="sig-paren">(</span><em>spaces: rl_coach.spaces.SpacesDefinition</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.set_environment_parameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.set_environment_parameters" title="Permalink to this definition"></a></dt>
<dd><p>Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>spaces</strong> the environment spaces definition</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>spaces</strong> the environment spaces definition</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -692,58 +623,47 @@ dependent on those values, by calling init_environment_dependent_modules</p>
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
in-action-space.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> The action that should be set as the directive</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"></td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>action</strong> The action that should be set as the directive</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p></p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.set_session">
<code class="descname">set_session</code><span class="sig-paren">(</span><em>sess</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.set_session"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.set_session" title="Permalink to this definition"></a></dt>
<dd><p>Set the deep learning framework session for all the agents in the composite agent</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.setup_logger">
<code class="descname">setup_logger</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.setup_logger"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.setup_logger" title="Permalink to this definition"></a></dt>
<dd><p>Setup the logger for the agent</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.sync">
<code class="descname">sync</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.sync"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.sync" title="Permalink to this definition"></a></dt>
<dd><p>Sync the global network parameters to local networks</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -752,14 +672,11 @@ in-action-space.</p>
<dd><p>Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The total training loss during the training iterations.</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The total training loss during the training iterations.</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -768,28 +685,22 @@ A single training iteration: Sample a batch, train on it and update target netwo
<dd><p>Updates the episodic log file with all the signal values from the most recent episode.
Additional signals for logging can be set by creating a new signal using self.register_signal,
and then updating it with some internal agent values.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="rl_coach.agents.agent.Agent.update_step_in_episode_log">
<code class="descname">update_step_in_episode_log</code><span class="sig-paren">(</span><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.update_step_in_episode_log"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.update_step_in_episode_log" title="Permalink to this definition"></a></dt>
<dd><p>Updates the in-episode log file with all the signal values from the most recent step.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="method">
@@ -797,16 +708,14 @@ and then updating it with some internal agent values.</p>
<code class="descname">update_transition_before_adding_to_replay_buffer</code><span class="sig-paren">(</span><em>transition: rl_coach.core_types.Transition</em><span class="sig-paren">)</span> &#x2192; rl_coach.core_types.Transition<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.update_transition_before_adding_to_replay_buffer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.update_transition_before_adding_to_replay_buffer" title="Permalink to this definition"></a></dt>
<dd><p>Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transition</strong> the transition to update</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the updated transition</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>transition</strong> the transition to update</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>the updated transition</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
@@ -824,7 +733,7 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
<a href="policy_optimization/ac.html" class="btn btn-neutral float-right" title="Actor-Critic" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../../contributing/add_env.html" class="btn btn-neutral" title="Adding a New Environment" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../../contributing/add_env.html" class="btn btn-neutral float-left" title="Adding a New Environment" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -833,7 +742,7 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -850,27 +759,16 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../_static/jquery.js"></script>
<script type="text/javascript" src="../../_static/underscore.js"></script>
<script type="text/javascript" src="../../_static/doctools.js"></script>
<script type="text/javascript" src="../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>
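The Agent methods documented above (reset_internal_state, act, observe, train) fit together roughly as in the sketch below. Coach normally drives agents through presets and graph managers rather than a hand-written loop, so the env object, the game_over attribute, and the action attribute on ActionInfo are assumptions used only for illustration.

def run_episode(agent, env):
    # Clear episodic counters and state before the episode starts.
    agent.reset_internal_state()
    env_response = env.reset()                      # assumed environment API
    done = False
    while not done:
        # act() returns an ActionInfo object holding the chosen action.
        action_info = agent.act()
        env_response = env.step(action_info.action)
        # observe() stores the response and returns True if the agent decided
        # to terminate the episode after seeing this observation.
        done = agent.observe(env_response) or getattr(env_response, "game_over", False)
        # train() runs training iterations only when its schedule allows it,
        # and returns the total training loss.
        agent.train()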

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Direct Future Prediction &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Direct Future Prediction &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Soft Actor-Critic" href="../policy_optimization/sac.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -228,13 +231,13 @@
<div class="section" id="choosing-an-action">
<h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>The current states (observations and measurements) and the corresponding goal vector are passed as an input to the network.
<li><p>The current states (observations and measurements) and the corresponding goal vector are passed as an input to the network.
The output of the network is the predicted future measurements for time-steps <span class="math notranslate nohighlight">\(t+1,t+2,t+4,t+8,t+16\)</span> and
<span class="math notranslate nohighlight">\(t+32\)</span> for each possible action.</li>
<li>For each action, the measurements of each predicted time-step are multiplied by the goal vector,
and the result is a single vector of future values for each action.</li>
<li>Then, a weighted sum of the future values of each action is calculated, and the result is a single value for each action.</li>
<li>The action values are passed to the exploration policy to decide on the action to use.</li>
<span class="math notranslate nohighlight">\(t+32\)</span> for each possible action.</p></li>
<li><p>For each action, the measurements of each predicted time-step are multiplied by the goal vector,
and the result is a single vector of future values for each action.</p></li>
<li><p>Then, a weighted sum of the future values of each action is calculated, and the result is a single value for each action.</p></li>
<li><p>The action values are passed to the exploration policy to decide on the action to use (see the sketch below).</p></li>
</ol>
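<p>To make the arithmetic of the four steps above concrete, here is a minimal NumPy sketch of the action-scoring computation (shapes, names, and values are illustrative assumptions, not the Coach implementation):</p>
<div class="highlight"><pre>
import numpy as np

# Illustrative shapes: 4 actions, 6 predicted time-steps
# (t+1, t+2, t+4, t+8, t+16, t+32), 3 measurements per time-step.
num_actions, num_steps, num_measurements = 4, 6, 3

# Step 1: predicted future measurements per action (the network output).
predicted = np.random.randn(num_actions, num_steps, num_measurements)

# Step 2: weight each measurement by the goal vector.
goal_vector = np.array([0.5, 0.5, 1.0])
future_values = predicted @ goal_vector                        # (num_actions, num_steps)

# Step 3: weighted sum over the predicted time-steps.
future_measurements_weights = np.array([0.0, 0.0, 0.0, 0.5, 0.5, 1.0])
action_values = future_values @ future_measurements_weights   # (num_actions,)

# Step 4: the exploration policy chooses from these values, e.g. greedily.
action = int(np.argmax(action_values))
</pre></div>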
</div>
<div class="section" id="training-the-network">
@@ -247,39 +250,35 @@ For the actions that were not taken, the targets are the current values.</p>
<dl class="class">
<dt id="rl_coach.agents.dfp_agent.DFPAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.dfp_agent.</code><code class="descname">DFPAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/dfp_agent.html#DFPAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.dfp_agent.DFPAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_predicted_steps_ahead</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_predicted_steps_ahead</strong> (int)
Number of future steps to predict measurements for. The future steps won't be sequential, but rather jump
in multiples of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4.
The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]</li>
<li><strong>goal_vector</strong> (List[float])
The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]</p></li>
<li><p><strong>goal_vector</strong> (List[float])
The goal vector will weight each of the measurements to form an optimization goal. The vector should have
the same length as the number of measurements, and it will be vector multiplied by the measurements.
Positive values correspond to trying to maximize the particular measurement, and negative values
correspond to trying to minimize the particular measurement.</li>
<li><strong>future_measurements_weights</strong> (List[float])
correspond to trying to minimize the particular measurement.</p></li>
<li><p><strong>future_measurements_weights</strong> (List[float])
The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
then only the 3 last timesteps will be taken into account, according to the weights in the
future_measurements_weights vector.</li>
<li><strong>use_accumulated_reward_as_measurement</strong> (bool)
future_measurements_weights vector.</p></li>
<li><p><strong>use_accumulated_reward_as_measurement</strong> (bool)
If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
the measurements vector in the state. This can be useful in environments where the given measurements don't
include enough information for the particular goal the agent should achieve.</li>
<li><strong>handling_targets_after_episode_end</strong> (HandlingTargetsAfterEpisodeEnd)
Dictates how to handle measurements that are outside the episode length.</li>
<li><strong>scale_measurements_targets</strong> (Dict[str, float])
include enough information for the particular goal the agent should achieve.</p></li>
<li><p><strong>handling_targets_after_episode_end</strong> (HandlingTargetsAfterEpisodeEnd)
Dictates how to handle measurements that are outside the episode length.</p></li>
<li><p><strong>scale_measurements_targets</strong> (Dict[str, float])
Allows rescaling the values of each of the measurements available. This can be useful when the measurements
have a different scale and you want to normalize them to the same scale.</li>
have a different scale and you want to normalize them to the same scale.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -297,7 +296,7 @@ have a different scale and you want to normalize them to the same scale.</li>
<a href="../value_optimization/double_dqn.html" class="btn btn-neutral float-right" title="Double DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../policy_optimization/sac.html" class="btn btn-neutral" title="Soft Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../policy_optimization/sac.html" class="btn btn-neutral float-left" title="Soft Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -306,7 +305,7 @@ have a different scale and you want to normalize them to the same scale.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -323,27 +322,16 @@ have a different scale and you want to normalize them to the same scale.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Actor-Critic &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Actor-Critic &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Agents" href="../index.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -235,41 +238,37 @@ distribution assigned with these probabilities. When testing, the action with th
<p>A batch of <span class="math notranslate nohighlight">\(T_{max}\)</span> transitions is used, and the advantages are calculated upon it.</p>
<p>Advantages can be calculated by either of the following methods (configured by the selected preset) -</p>
<ol class="arabic simple">
<li><strong>A_VALUE</strong> - Estimating advantage directly:
<li><p><strong>A_VALUE</strong> - Estimating advantage directly:
<span class="math notranslate nohighlight">\(A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t)\)</span>
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch.</li>
<li><strong>GAE</strong> - By following the <a class="reference external" href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper.</li>
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch.</p></li>
<li><p><strong>GAE</strong> - By following the <a class="reference external" href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper.</p></li>
</ol>
<p>The advantages are then used in order to accumulate gradients according to
<span class="math notranslate nohighlight">\(L = -\mathop{\mathbb{E}} [log (\pi) \cdot A]\)</span></p>
<dl class="class">
<dt id="rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.actor_critic_agent.</code><code class="descname">ActorCriticAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/actor_critic_agent.html#ActorCriticAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
The value that will be used to rescale the policy gradient</li>
<li><strong>apply_gradients_every_x_episodes</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
The value that will be used to rescale the policy gradient</p></li>
<li><p><strong>apply_gradients_every_x_episodes</strong> (int)
The number of episodes to wait before applying the accumulated gradients to the network.
The training iterations only accumulate gradients without actually applying them.</li>
<li><strong>beta_entropy</strong> (float)
The weight that will be given to the entropy regularization which is used in order to improve exploration.</li>
<li><strong>num_steps_between_gradient_updates</strong> (int)
The training iterations only accumulate gradients without actually applying them.</p></li>
<li><p><strong>beta_entropy</strong> (float)
The weight that will be given to the entropy regularization which is used in order to improve exploration.</p></li>
<li><p><strong>num_steps_between_gradient_updates</strong> (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and use for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</li>
<li><strong>gae_lambda</strong> (float)
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</p></li>
<li><p><strong>gae_lambda</strong> (float)
If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage estimation
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.</li>
<li><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value targets for the V head will be estimated using the GAE scheme.</li>
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.</p></li>
<li><p><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value targets for the V head will be estimated using the GAE scheme.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
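<p>Since gae_lambda and estimate_state_value_using_gae both refer to the Generalized Advantage Estimation scheme, a short sketch of that recursion may help (illustrative NumPy, not the Coach implementation):</p>
<div class="highlight"><pre>
import numpy as np

def gae(rewards, values, bootstrap_value, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation: an exponentially weighted mix of
    n-step advantage estimators, with the decay controlled by lambda
    (the gae_lambda parameter)."""
    values = np.append(values, bootstrap_value)
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]   # TD error
        running = delta + gamma * lam * running
        advantages[t] = running
    # When estimate_state_value_using_gae is True, the V-head targets can be
    # formed as advantage + value:
    value_targets = advantages + values[:-1]
    return advantages, value_targets
</pre></div>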
</div>
@@ -287,7 +286,7 @@ If set to True, the state value targets for the V head will be estimated using t
<a href="acer.html" class="btn btn-neutral float-right" title="ACER" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../index.html" class="btn btn-neutral" title="Agents" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../index.html" class="btn btn-neutral float-left" title="Agents" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -296,7 +295,7 @@ If set to True, the state value targets for the V head will be estimated using t
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -313,27 +312,16 @@ If set to True, the state value targets for the V head will be estimated using t
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ACER &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>ACER &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Actor-Critic" href="ac.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -236,11 +239,11 @@ distribution assigned with these probabilities. When testing, the action with th
and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-policy updates from batches of <span class="math notranslate nohighlight">\(T_{max}\)</span> transitions sampled from the replay buffer.</p>
<p>Each update performs the following procedure:</p>
<ol class="arabic">
<li><p class="first"><strong>Calculate state values:</strong></p>
<li><p><strong>Calculate state values:</strong></p>
<div class="math notranslate nohighlight">
\[V(s_t) = \mathbb{E}_{a \sim \pi} [Q(s_t,a)]\]</div>
</li>
<li><p class="first"><strong>Calculate Q retrace:</strong></p>
<li><p><strong>Calculate Q retrace:</strong></p>
<blockquote>
<div><div class="math notranslate nohighlight">
\[Q^{ret}(s_t,a_t) = r_t +\gamma \bar{\rho}_{t+1}[Q^{ret}(s_{t+1},a_{t+1}) - Q(s_{t+1},a_{t+1})] + \gamma V(s_{t+1})\]</div>
@@ -248,7 +251,7 @@ and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-p
\[\text{where} \quad \bar{\rho}_{t} = \min{\left\{c,\rho_t\right\}},\quad \rho_t=\frac{\pi (a_t \mid s_t)}{\mu (a_t \mid s_t)}\]</div>
</div></blockquote>
</li>
<li><p class="first"><strong>Accumulate gradients:</strong></p>
<li><p><strong>Accumulate gradients:</strong></p>
<blockquote>
<div><p><span class="math notranslate nohighlight">\(\bullet\)</span> <strong>Policy gradients (with bias correction):</strong></p>
<blockquote>
@@ -263,7 +266,7 @@ and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-p
</div></blockquote>
</div></blockquote>
</li>
<li><p class="first"><strong>(Optional) Trust region update:</strong> change the policy loss gradient w.r.t network output:</p>
<li><p><strong>(Optional) Trust region update:</strong> change the policy loss gradient w.r.t network output:</p>
<blockquote>
<div><div class="math notranslate nohighlight">
\[\hat{g}_t^{trust-region} = \hat{g}_t^{policy} - \max \left\{0, \frac{k^T \hat{g}_t^{policy} - \delta}{\lVert k \rVert_2^2}\right\} k\]</div>
@@ -277,39 +280,35 @@ The goal of the trust region update is to the difference between the updated pol
<dl class="class">
<dt id="rl_coach.agents.acer_agent.ACERAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.acer_agent.</code><code class="descname">ACERAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/acer_agent.html#ACERAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.acer_agent.ACERAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_steps_between_gradient_updates</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_steps_between_gradient_updates</strong> (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and use for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</li>
<li><strong>ratio_of_replay</strong> (int)
The number of off-policy training iterations in each ACER iteration.</li>
<li><strong>num_transitions_to_start_replay</strong> (int)
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</p></li>
<li><p><strong>ratio_of_replay</strong> (int)
The number of off-policy training iterations in each ACER iteration.</p></li>
<li><p><strong>num_transitions_to_start_replay</strong> (int)
Number of environment steps until ACER starts to train off-policy from the experience replay.
This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions in
the experience replay to start the off-policy training.</li>
<li><strong>rate_for_copying_weights_to_target</strong> (float)
the experience replay to start the off-policy training.</p></li>
<li><p><strong>rate_for_copying_weights_to_target</strong> (float)
The rate of the exponential moving average for the average policy which is used for the trust region optimization.
The target network in this algorithm is used as the average policy.</li>
<li><strong>importance_weight_truncation</strong> (float)
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).</li>
<li><strong>use_trust_region_optimization</strong> (bool)
The target network in this algorithm is used as the average policy.</p></li>
<li><p><strong>importance_weight_truncation</strong> (float)
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).</p></li>
<li><p><strong>use_trust_region_optimization</strong> (bool)
If set to True, the gradients of the network will be modified with a term dependent on the KL divergence between
the average policy and the current one, to bound the change of the policy during the network update.</li>
<li><strong>max_KL_divergence</strong> (float)
the average policy and the current one, to bound the change of the policy during the network update.</p></li>
<li><p><strong>max_KL_divergence</strong> (float)
The upper bound parameter for the trust region optimization, use_trust_region_optimization needs to be set true
for this parameter to have an effect.</li>
<li><strong>beta_entropy</strong> (float)
for this parameter to have an effect.</p></li>
<li><p><strong>beta_entropy</strong> (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the beta value defined by beta_entropy.</li>
is weighted using the beta value defined by beta_entropy.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
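<p>As a complement to the equations above, here is a minimal sketch of the Q-retrace recursion with truncated importance weights (plain NumPy, illustrative names and bootstrap handling, not the Coach implementation):</p>
<div class="highlight"><pre>
import numpy as np

def q_retrace_targets(rewards, q_taken, values, rho, gamma=0.99, c=1.0):
    """Sketch of Q^ret over a batch of T transitions plus the state that
    follows it:
      rewards[t]  r_t                          (length T)
      q_taken[t]  Q(s_t, a_t)                  (length T + 1)
      values[t]   V(s_t) = E_{a~pi}[Q(s_t, a)] (length T + 1)
      rho[t]      pi(a_t|s_t) / mu(a_t|s_t)    (length T + 1)
    The trailing entries provide the bootstrap for the last transition."""
    T = len(rewards)
    q_ret = np.zeros(T)
    next_q_ret = q_taken[T]          # assume Q^ret of the bootstrap state ~ Q
    for t in reversed(range(T)):
        rho_bar = min(c, rho[t + 1])           # truncated importance weight
        q_ret[t] = (rewards[t]
                    + gamma * rho_bar * (next_q_ret - q_taken[t + 1])
                    + gamma * values[t + 1])
        next_q_ret = q_ret[t]
    return q_ret
</pre></div>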
</div>
@@ -327,7 +326,7 @@ is weighted using the beta value defined by beta_entropy.</li>
<a href="../imitation/bc.html" class="btn btn-neutral float-right" title="Behavioral Cloning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="ac.html" class="btn btn-neutral" title="Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="ac.html" class="btn btn-neutral float-left" title="Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -336,7 +335,7 @@ is weighted using the beta value defined by beta_entropy.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -353,27 +352,16 @@ is weighted using the beta value defined by beta_entropy.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Clipped Proximal Policy Optimization &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Clipped Proximal Policy Optimization &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Conditional Imitation Learning" href="../imitation/cil.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -233,17 +236,14 @@
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<p>Very similar to PPO, with several small (but very simplifying) changes:</p>
<ol class="arabic">
<li><p class="first">Train both the value and policy networks, simultaneously, by defining a single loss function,
which is the sum of each of the networks loss functions. Then, back propagate gradients only once from this unified loss function.</p>
</li>
<li><p class="first">The unified networks optimizer is set to Adam (instead of L-BFGS for the value network as in PPO).</p>
</li>
<li><p class="first">Value targets are now also calculated based on the GAE advantages.
<li><p>Train both the value and policy networks, simultaneously, by defining a single loss function,
which is the sum of each of the networks' loss functions. Then, backpropagate gradients only once from this unified loss function.</p></li>
<li><p>The unified network's optimizer is set to Adam (instead of L-BFGS for the value network as in PPO).</p></li>
<li><p>Value targets are now also calculated based on the GAE advantages.
In this method, the <span class="math notranslate nohighlight">\(V\)</span> values are predicted from the critic network, and then added to the GAE based advantages,
in order to get a <span class="math notranslate nohighlight">\(Q\)</span> value for each action. Now, since our critic network is predicting a <span class="math notranslate nohighlight">\(V\)</span> value for
each state, setting the <span class="math notranslate nohighlight">\(Q\)</span> calculated action-values as a target, will on average serve as a <span class="math notranslate nohighlight">\(V\)</span> state-value target.</p>
</li>
<li><p class="first">Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio
each state, setting the <span class="math notranslate nohighlight">\(Q\)</span> calculated action-values as a target will on average serve as a <span class="math notranslate nohighlight">\(V\)</span> state-value target.</p></li>
<li><p>Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio
<span class="math notranslate nohighlight">\(r_t(\theta) =\frac{\pi_{\theta}(a|s)}{\pi_{\theta_{old}}(a|s)}\)</span> is clipped, to achieve a similar effect.
This is done by defining the policy's loss function to be the minimum between the standard surrogate loss and an epsilon
clipped surrogate loss:</p>
@@ -253,46 +253,42 @@ clipped surrogate loss:</p>
<dl class="class">
<dt id="rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.clipped_ppo_agent.</code><code class="descname">ClippedPPOAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/clipped_ppo_agent.html#ClippedPPOAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.</li>
<li><strong>gae_lambda</strong> (float)
advantage of the action, or the generalized advantage estimation (GAE) value.</p></li>
<li><p><strong>gae_lambda</strong> (float)
The <span class="math notranslate nohighlight">\(\lambda\)</span> value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.</li>
<li><strong>clip_likelihood_ratio_using_epsilon</strong> (float)
n-step estimations.</p></li>
<li><p><strong>clip_likelihood_ratio_using_epsilon</strong> (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.</li>
<li><strong>value_targets_mix_fraction</strong> (float)
implementations.</p></li>
<li><p><strong>value_targets_mix_fraction</strong> (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</li>
<li><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value will be estimated using the GAE technique.</li>
<li><strong>use_kl_regularization</strong> (bool)
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</p></li>
<li><p><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value will be estimated using the GAE technique.</p></li>
<li><p><strong>use_kl_regularization</strong> (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.</li>
<li><strong>beta_entropy</strong> (float)
policy, to bound the change of the policy during the network update.</p></li>
<li><p><strong>beta_entropy</strong> (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</li>
<li><strong>optimization_epochs</strong> (int)
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</p></li>
<li><p><strong>optimization_epochs</strong> (int)
For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
optimization_epochs value.</li>
<li><strong>optimization_epochs</strong> (Schedule)
Can be used to define a schedule over the clipping of the likelihood ratio.</li>
optimization_epochs value.</p></li>
<li><p><strong>optimization_epochs</strong> (Schedule)
Can be used to define a schedule over the clipping of the likelihood ratio.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
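<p>The effect of clip_likelihood_ratio_using_epsilon can be seen in a few lines. Below is an illustrative NumPy sketch of the clipped surrogate objective (not the Coach implementation):</p>
<div class="highlight"><pre>
import numpy as np

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages,
                           clip_epsilon=0.2):
    """Clipped PPO policy loss: the minimum between the standard surrogate
    and the epsilon-clipped surrogate, negated for gradient descent."""
    ratio = np.exp(new_log_probs - old_log_probs)          # r_t(theta)
    clipped_ratio = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    surrogate = np.minimum(ratio * advantages, clipped_ratio * advantages)
    return -np.mean(surrogate)
</pre></div>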
</div>
@@ -310,7 +306,7 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
<a href="ddpg.html" class="btn btn-neutral float-right" title="Deep Deterministic Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../imitation/cil.html" class="btn btn-neutral" title="Conditional Imitation Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../imitation/cil.html" class="btn btn-neutral float-left" title="Conditional Imitation Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -319,7 +315,7 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -336,27 +332,16 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Deep Deterministic Policy Gradient &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Deep Deterministic Policy Gradient &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Clipped Proximal Policy Optimization" href="cppo.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -235,14 +238,14 @@ to add exploration noise to the action. When testing, use the mean vector <span
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<p>Start by sampling a batch of transitions from the experience replay.</p>
<ul>
<li><p class="first">To train the <strong>critic network</strong>, use the following targets:</p>
<li><p>To train the <strong>critic network</strong>, use the following targets:</p>
<p><span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} ))\)</span></p>
<p>First run the actor target network, using the next states as the inputs, and get <span class="math notranslate nohighlight">\(\mu (s_{t+1} )\)</span>.
Next, run the critic target network using the next states and <span class="math notranslate nohighlight">\(\mu (s_{t+1} )\)</span>, and use the output to
calculate <span class="math notranslate nohighlight">\(y_t\)</span> according to the equation above. To train the network, use the current states and actions
as the inputs, and <span class="math notranslate nohighlight">\(y_t\)</span> as the targets.</p>
</li>
<li><p class="first">To train the <strong>actor network</strong>, use the following equation:</p>
<li><p>To train the <strong>actor network</strong>, use the following equation:</p>
<p><span class="math notranslate nohighlight">\(\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]\)</span></p>
<p>Use the actor's online network to get the action mean values using the current states as the inputs.
Then, use the critic online network in order to get the gradients of the critic output with respect to the
@@ -255,35 +258,31 @@ given <span class="math notranslate nohighlight">\(\nabla_a Q(s,a)\)</span>. Fin
<dl class="class">
<dt id="rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.ddpg_agent.</code><code class="descname">DDPGAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/ddpg_agent.html#DDPGAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</li>
<li><strong>rate_for_copying_weights_to_target</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</p></li>
<li><p><strong>rate_for_copying_weights_to_target</strong> (float)
When copying the online network weights to the target network weights, a soft update will be used, which
weight the new online network weights by rate_for_copying_weights_to_target</li>
<li><strong>num_consecutive_playing_steps</strong> (StepMethod)
The number of consecutive steps to act between every two training iterations</li>
<li><strong>use_target_network_for_evaluation</strong> (bool)
weights the new online network weights by rate_for_copying_weights_to_target</p></li>
<li><p><strong>num_consecutive_playing_steps</strong> (StepMethod)
The number of consecutive steps to act between every two training iterations</p></li>
<li><p><strong>use_target_network_for_evaluation</strong> (bool)
If set to True, the target network will be used for predicting the actions when choosing actions to act.
Since the target network weights change more slowly, the predicted actions will be more consistent.</li>
<li><strong>action_penalty</strong> (float)
Since the target network weights change more slowly, the predicted actions will be more consistent.</p></li>
<li><p><strong>action_penalty</strong> (float)
The amount by which to penalize the network on high action feature (pre-activation) values.
This can prevent the action features from saturating the TanH activation function, and therefore prevent the
gradients from becoming very low.</li>
<li><strong>clip_critic_targets</strong> (Tuple[float, float] or None)
The range to clip the critic target to in order to prevent overestimation of the action values.</li>
<li><strong>use_non_zero_discount_for_terminal_states</strong> (bool)
gradients from becoming very low.</p></li>
<li><p><strong>clip_critic_targets</strong> (Tuple[float, float] or None)
The range to clip the critic target to in order to prevent overestimation of the action values.</p></li>
<li><p><strong>use_non_zero_discount_for_terminal_states</strong> (bool)
If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
values. If set to False, the terminal state's reward will be taken as the target return for the network.</li>
values. If set to False, the terminal state's reward will be taken as the target return for the network.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
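<p>To make the critic target and the soft target-network update concrete, here is a small illustrative sketch; the callables standing in for the target networks are assumptions, not the Coach implementation:</p>
<div class="highlight"><pre>
import numpy as np

def ddpg_critic_targets(rewards, next_states, dones,
                        actor_target, critic_target, gamma=0.99):
    """y_t = r(s_t, a_t) + gamma * Q(s_{t+1}, mu(s_{t+1})), computed with the
    target networks.  The (1 - dones) factor zeroes the bootstrap for
    terminal states (cf. use_non_zero_discount_for_terminal_states)."""
    next_actions = actor_target(next_states)            # mu(s_{t+1})
    next_q = critic_target(next_states, next_actions)   # Q(s_{t+1}, mu(s_{t+1}))
    return rewards + gamma * (1.0 - dones) * next_q

def soft_update(online_weights, target_weights, tau):
    """Soft update used when copying online weights to the target network,
    with tau playing the role of rate_for_copying_weights_to_target."""
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_weights, target_weights)]
</pre></div>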
</div>
@@ -301,7 +300,7 @@ values. If set to False, the terminal states reward will be taken as the target
<a href="sac.html" class="btn btn-neutral float-right" title="Soft Actor-Critic" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="cppo.html" class="btn btn-neutral" title="Clipped Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="cppo.html" class="btn btn-neutral float-left" title="Clipped Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -310,7 +309,7 @@ values. If set to False, the terminal states reward will be taken as the target
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -327,27 +326,16 @@ values. If set to False, the terminal states reward will be taken as the target
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Hierarchical Actor Critic &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Hierarchical Actor Critic &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -31,21 +39,16 @@
<link rel="search" title="Search" href="../../../search.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -212,7 +215,7 @@ to add exploration noise to the action. When testing, use the mean vector <span
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -229,27 +232,16 @@ to add exploration noise to the action. When testing, use the mean vector <span
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Policy Gradient &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Policy Gradient &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Persistent Advantage Learning" href="../value_optimization/pal.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -237,11 +240,11 @@ The <code class="code docutils literal notranslate"><span class="pre">PolicyGrad
This is done in order to reduce the variance of the updates, since noisy gradient updates might destabilize the policy's
convergence. The rescaler is a configurable parameter and there are a few options to choose from:</p>
<ul class="simple">
<li><strong>Total Episode Return</strong> - The sum of all the discounted rewards during the episode.</li>
<li><strong>Future Return</strong> - Return from each transition until the end of the episode.</li>
<li><strong>Future Return Normalized by Episode</strong> - Future returns across the episode normalized by the episodes mean and standard deviation.</li>
<li><strong>Future Return Normalized by Timestep</strong> - Future returns normalized using running means and standard deviations,
which are calculated seperately for each timestep, across different episodes.</li>
<li><p><strong>Total Episode Return</strong> - The sum of all the discounted rewards during the episode.</p></li>
<li><p><strong>Future Return</strong> - Return from each transition until the end of the episode (see the sketch after this list).</p></li>
<li><p><strong>Future Return Normalized by Episode</strong> - Future returns across the episode normalized by the episode's mean and standard deviation.</p></li>
<li><p><strong>Future Return Normalized by Timestep</strong> - Future returns normalized using running means and standard deviations,
which are calculated separately for each timestep, across different episodes.</p></li>
</ul>
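<p>For reference, here is a minimal sketch of the first two rescalers from the list above for a single episode (illustrative NumPy, not the Coach implementation):</p>
<div class="highlight"><pre>
import numpy as np

def future_returns(rewards, gamma=0.99):
    """'Future Return' rescaler: discounted return from each transition
    until the end of the episode (gamma = 1.0 gives plain sums)."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def total_episode_return(rewards, gamma=0.99):
    """'Total Episode Return' rescaler: the discounted sum of all rewards,
    assigned to every transition in the episode."""
    return np.full(len(rewards), future_returns(rewards, gamma)[0])

# 'Future Return Normalized by Episode' then standardizes future_returns
# by the episode's mean and standard deviation.
</pre></div>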
<p>Gradients are accumulated over a number of full played episodes. The gradients accumulation over several episodes
serves the same purpose - reducing the update variance. After accumulating gradients for several episodes,
@@ -249,32 +252,28 @@ the gradients are then applied to the network.</p>
<dl class="class">
<dt id="rl_coach.agents.policy_gradients_agent.PolicyGradientAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.policy_gradients_agent.</code><code class="descname">PolicyGradientAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/policy_gradients_agent.html#PolicyGradientAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.policy_gradients_agent.PolicyGradientAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
The rescaler type to use for the policy gradient loss. For policy gradients, we calculate log probability of
the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
return, but there are other rescalers that are intended for reducing the variance of the updates.</li>
<li><strong>apply_gradients_every_x_episodes</strong> (int)
return, but there are other rescalers that are intended for reducing the variance of the updates.</p></li>
<li><p><strong>apply_gradients_every_x_episodes</strong> (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate them in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.</li>
<li><strong>beta_entropy</strong> (float)
apply_gradients_every_x_episodes episodes.</p></li>
<li><p><strong>beta_entropy</strong> (float)
A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
will be added to the loss and scaled by the given beta factor.</li>
<li><strong>num_steps_between_gradient_updates</strong> (int)
will be added to the loss and scaled by the given beta factor.</p></li>
<li><p><strong>num_steps_between_gradient_updates</strong> (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.</li>
are used in the batch.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -292,7 +291,7 @@ are used in the batch.</li>
<a href="ppo.html" class="btn btn-neutral float-right" title="Proximal Policy Optimization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../value_optimization/pal.html" class="btn btn-neutral" title="Persistent Advantage Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../value_optimization/pal.html" class="btn btn-neutral float-left" title="Persistent Advantage Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -301,7 +300,7 @@ are used in the batch.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -318,27 +317,16 @@ are used in the batch.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Proximal Policy Optimization &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Proximal Policy Optimization &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Policy Gradient" href="pg.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -234,66 +237,62 @@ When testing, just take the mean values predicted by the network.</p>
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Collect a big chunk of experience (in the order of thousands of transitions, sampled from multiple episodes).</li>
<li>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman 2015).</li>
<li>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers,
<li><p>Collect a big chunk of experience (in the order of thousands of transitions, sampled from multiple episodes).</p></li>
<li><p>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman 2015).</p></li>
<li><p>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers,
the L-BFGS optimizer runs on the entire dataset at once, without batching.
It continues running until some low loss threshold is reached. To prevent overfitting to the current dataset,
the value targets are updated in a soft manner, using an Exponentially Weighted Moving Average, based on the total
discounted returns of each state in each episode.</li>
<li>Run several training iterations of the policy network. This is done by using the previously calculated advantages as
discounted returns of each state in each episode.</p></li>
<li><p>Run several training iterations of the policy network. This is done by using the previously calculated advantages as
targets. The loss function penalizes policies that deviate too far from the old policy (the policy that was used <em>before</em>
starting to run the current set of training iterations) using a regularization term.</li>
<li>After training is done, the last sampled KL divergence value will be compared with the <em>target KL divergence</em> value,
starting to run the current set of training iterations) using a regularization term.</p></li>
<li><p>After training is done, the last sampled KL divergence value will be compared with the <em>target KL divergence</em> value,
in order to adapt the penalty coefficient used in the policy loss. If the KL divergence went too high,
increase the penalty; if it went too low, reduce it; otherwise, leave it unchanged.</li>
increase the penalty; if it went too low, reduce it; otherwise, leave it unchanged.</p></li>
</ol>
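<p>As a concrete reference for step 2 above, here is a small standalone NumPy sketch of Generalized Advantage Estimation. It is illustrative only and assumes per-episode arrays of rewards and value estimates; it is not the Coach implementation.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def generalized_advantage_estimation(rewards, values, discount=0.99, gae_lambda=0.95):
    # rewards: r_0 ... r_{T-1};  values: V(s_0) ... V(s_T) (one extra bootstrap value)
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    deltas = rewards + discount * values[1:] - values[:-1]    # TD residuals
    advantages = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages
</pre></div></div>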
<dl class="class">
<dt id="rl_coach.agents.ppo_agent.PPOAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.ppo_agent.</code><code class="descname">PPOAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/ppo_agent.html#PPOAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.ppo_agent.PPOAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>policy_gradient_rescaler</strong> (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.</li>
<li><strong>gae_lambda</strong> (float)
advantage of the action, or the generalized advantage estimation (GAE) value.</p></li>
<li><p><strong>gae_lambda</strong> (float)
The <span class="math notranslate nohighlight">\(\lambda\)</span> value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.</li>
<li><strong>target_kl_divergence</strong> (float)
n-step estimations.</p></li>
<li><p><strong>target_kl_divergence</strong> (float)
The target KL divergence between the current policy distribution and the new policy. PPO uses a heuristic to
bring the KL divergence to this value, by adding a penalty if the KL divergence is higher.</li>
<li><strong>initial_kl_coefficient</strong> (float)
bring the KL divergence to this value, by adding a penalty if the KL divergence is higher.</p></li>
<li><p><strong>initial_kl_coefficient</strong> (float)
The initial weight that will be given to the KL divergence between the current and the new policy in the
regularization factor.</li>
<li><strong>high_kl_penalty_coefficient</strong> (float)
The penalty that will be given for KL divergence values which are higher than what was defined as the target.</li>
<li><strong>clip_likelihood_ratio_using_epsilon</strong> (float)
regularization factor.</p></li>
<li><p><strong>high_kl_penalty_coefficient</strong> (float)
The penalty that will be given for KL divergence values which are higher than what was defined as the target.</p></li>
<li><p><strong>clip_likelihood_ratio_using_epsilon</strong> (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.</li>
<li><strong>value_targets_mix_fraction</strong> (float)
implementations.</p></li>
<li><p><strong>value_targets_mix_fraction</strong> (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set in the range (0,1], where 1 means that only the new targets will be taken into account.</li>
<li><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value will be estimated using the GAE technique.</li>
<li><strong>use_kl_regularization</strong> (bool)
This value should be set in the range (0,1], where 1 means that only the new targets will be taken into account.</p></li>
<li><p><strong>estimate_state_value_using_gae</strong> (bool)
If set to True, the state value will be estimated using the GAE technique.</p></li>
<li><p><strong>use_kl_regularization</strong> (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.</li>
<li><strong>beta_entropy</strong> (float)
policy, to bound the change of the policy during the network update.</p></li>
<li><p><strong>beta_entropy</strong> (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</li>
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
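<p>The heuristic behind target_kl_divergence, initial_kl_coefficient and high_kl_penalty_coefficient can be summarized with the following hedged sketch (the function name and the scaling constants are assumptions, not Coach's internal API): after each round of policy updates, the penalty coefficient is increased when the measured KL divergence overshoots the target and decreased when it undershoots it.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
def adapt_kl_coefficient(kl_coefficient, measured_kl, target_kl=0.01,
                         scale=1.5, tolerance=2.0):
    # Nudge the KL penalty coefficient so that the policy's KL divergence
    # drifts towards target_kl over successive update rounds.
    if measured_kl > tolerance * target_kl:
        kl_coefficient *= scale          # policy moved too far: penalize more
    elif target_kl / tolerance > measured_kl:
        kl_coefficient /= scale          # policy barely moved: penalize less
    return kl_coefficient
</pre></div></div>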
</div>
@@ -311,7 +310,7 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
<a href="../value_optimization/rainbow.html" class="btn btn-neutral float-right" title="Rainbow" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="pg.html" class="btn btn-neutral" title="Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="pg.html" class="btn btn-neutral float-left" title="Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -320,7 +319,7 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -337,27 +336,16 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Soft Actor-Critic &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Soft Actor-Critic &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Deep Deterministic Policy Gradient" href="ddpg.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -235,19 +238,19 @@ by picking the mean value or sample from a gaussian distribution like in trainin
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<p>Start by sampling a batch <span class="math notranslate nohighlight">\(B\)</span> of transitions from the experience replay.</p>
<ul>
<li><p class="first">To train the <strong>Q network</strong>, use the following targets:</p>
<li><p>To train the <strong>Q network</strong>, use the following targets:</p>
<div class="math notranslate nohighlight">
\[y_t^Q=r(s_t,a_t)+\gamma \cdot V(s_{t+1})\]</div>
<p>The state value used in the above target is acquired by running the target state value network.</p>
</li>
<li><p class="first">To train the <strong>State Value network</strong>, use the following targets:</p>
<li><p>To train the <strong>State Value network</strong>, use the following targets:</p>
<div class="math notranslate nohighlight">
\[y_t^V = \min_{i=1,2}Q_i(s_t,\tilde{a}) - log\pi (\tilde{a} \vert s),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)\]</div>
<p>The state value network is trained using a sample-based approximation of the connection between the state value and the state
action values. The actions used for constructing the target are <strong>not</strong> sampled from the replay buffer, but rather sampled
from the current policy.</p>
</li>
<li><p class="first">To train the <strong>actor network</strong>, use the following equation:</p>
<li><p>To train the <strong>actor network</strong>, use the following equation:</p>
<div class="math notranslate nohighlight">
\[\nabla_{\theta} J \approx \nabla_{\theta} \frac{1}{\vert B \vert} \sum_{s_t\in B} \left( Q \left(s_t, \tilde{a}_\theta(s_t)\right) - log\pi_{\theta}(\tilde{a}_{\theta}(s_t)\vert s_t) \right),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)\]</div>
</li>
@@ -256,24 +259,20 @@ from the current policy.</p>
<dl class="class">
<dt id="rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.soft_actor_critic_agent.</code><code class="descname">SoftActorCriticAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/soft_actor_critic_agent.html#SoftActorCriticAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</li>
<li><strong>rate_for_copying_weights_to_target</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</p></li>
<li><p><strong>rate_for_copying_weights_to_target</strong> (float)
When copying the online network weights to the target network weights, a soft update will be used, which
weights the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)</li>
<li><strong>use_deterministic_for_evaluation</strong> (bool)
weights the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)</p></li>
<li><p><strong>use_deterministic_for_evaluation</strong> (bool)
If True, during the evaluation phase, actions are chosen deterministically according to the policy mean
and not sampled from the policy distribution.</li>
and not sampled from the policy distribution.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
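<p>A short illustrative sketch of the state value target described above and of the soft target-network update controlled by rate_for_copying_weights_to_target (tau). The helper names and array inputs are assumptions for illustration, not Coach's API.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def state_value_targets(q1, q2, log_pi):
    # q1, q2: Q_i(s_t, a~) for actions a~ sampled from the current policy
    # log_pi: log pi(a~ | s_t) for those same sampled actions
    return np.minimum(q1, q2) - log_pi

def soft_update(target_weights, online_weights, tau=0.005):
    # target = (1 - tau) * target + tau * online, applied per weight tensor
    return [(1.0 - tau) * t + tau * o
            for t, o in zip(target_weights, online_weights)]
</pre></div></div>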
</div>
@@ -291,7 +290,7 @@ and not sampled from the policy distribution.</li>
<a href="../other/dfp.html" class="btn btn-neutral float-right" title="Direct Future Prediction" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="ddpg.html" class="btn btn-neutral" title="Deep Deterministic Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="ddpg.html" class="btn btn-neutral float-left" title="Deep Deterministic Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -300,7 +299,7 @@ and not sampled from the policy distribution.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -317,27 +316,16 @@ and not sampled from the policy distribution.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Bootstrapped DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Bootstrapped DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Behavioral Cloning" href="../imitation/bc.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -265,7 +268,7 @@ Then, train the online network according to the calculated targets.</p>
<a href="categorical_dqn.html" class="btn btn-neutral float-right" title="Categorical DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../imitation/bc.html" class="btn btn-neutral" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../imitation/bc.html" class="btn btn-neutral float-left" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -274,7 +277,7 @@ Then, train the online network according to the calculated targets.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -291,27 +294,16 @@ Then, train the online network according to the calculated targets.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Categorical DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Categorical DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Bootstrapped DQN" href="bs_dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,43 +230,36 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic">
<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
</li>
<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
<p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
<p>where:</p>
<ul class="simple">
<li><p><span class="math notranslate nohighlight">\([ \cdot ]\)</span> bounds its argument in the range <span class="math notranslate nohighlight">\([a, b]\)</span></p></li>
<li><p><span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom <span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r+\gamma z_j\)</span></p></li>
</ul>
</li>
<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
probability distribution. Only the target of the actions that were actually taken is updated.</p>
</li>
<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
</li>
<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
probability distribution. Only the target of the actions that were actually taken is updated.</p></li>
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
</ol>
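<p>To make step 2 concrete, below is a NumPy sketch of the categorical projection (illustrative only; terminal-state handling is omitted and the function signature is an assumption, not the Coach implementation).</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def project_distribution(rewards, next_probs, v_min=-10.0, v_max=10.0,
                         atoms=51, discount=0.99):
    # next_probs[b, j] = p_j(s_{t+1}, pi(s_{t+1})) for each transition b in the batch
    z = np.linspace(v_min, v_max, atoms)               # atom support z_0 ... z_{N-1}
    delta_z = (v_max - v_min) / (atoms - 1)
    batch = rewards.shape[0]
    projected = np.zeros((batch, atoms))
    rows = np.arange(batch)
    for j in range(atoms):
        tz = np.clip(rewards + discount * z[j], v_min, v_max)   # Bellman update per atom
        b = (tz - v_min) / delta_z                               # fractional atom index
        lower = np.floor(b).astype(int)
        upper = np.ceil(b).astype(int)
        same = (lower == upper)
        # distribute the probability mass of atom j to its two neighbouring atoms
        projected[rows, lower] += next_probs[:, j] * np.where(same, 1.0, upper - b)
        projected[rows, upper] += next_probs[:, j] * np.where(same, 0.0, b - lower)
    return projected
</pre></div></div>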
<dl class="class">
<dt id="rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.categorical_dqn_agent.</code><code class="descname">CategoricalDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/categorical_dqn_agent.html#CategoricalDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>v_min</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>v_min</strong> (float)
The minimal value that will be represented in the network output for predicting the Q value.
Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</li>
<li><strong>v_max</strong> (float)
Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</p></li>
<li><p><strong>v_max</strong> (float)
The maximum value that will be represented in the network output for predicting the Q value.
Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</li>
<li><strong>atoms</strong> (int)
Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</p></li>
<li><p><strong>atoms</strong> (int)
The number of atoms that will be used to discretize the range between v_min and v_max.
For the C51 algorithm described in the paper, the number of atoms is 51.</li>
For the C51 algorithm described in the paper, the number of atoms is 51.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -281,7 +277,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
<a href="../imitation/cil.html" class="btn btn-neutral float-right" title="Conditional Imitation Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="bs_dqn.html" class="btn btn-neutral" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="bs_dqn.html" class="btn btn-neutral float-left" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -290,7 +286,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -307,27 +303,16 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Double DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Double DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Direct Future Prediction" href="../other/dfp.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,17 +230,17 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
action <span class="math notranslate nohighlight">\(argmax_a Q(s_{t+1},a)\)</span>. For these actions, use the corresponding next states and run the target
network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</li>
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</p></li>
<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
use the current states from the sampled batch, and run the online network to get the current Q values predictions.
Set those values as the targets for the actions that were not actually played.</li>
<li>For each action that was played, use the following equation for calculating the targets of the network:
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
Set those values as the targets for the actions that were not actually played.</p></li>
<li><p>For each action that was played, use the following equation for calculating the targets of the network:
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
</ol>
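<p>A brief sketch of the target computation in steps 2 and 4 (array inputs and names are assumed for illustration; terminal-state masking is omitted): the online network selects the next action and the target network evaluates it.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def double_dqn_targets(rewards, q_next_online, q_next_target, discount=0.99):
    # q_next_online / q_next_target: Q-values for s_{t+1} from each network
    best_actions = np.argmax(q_next_online, axis=1)              # argmax_a Q_online(s_{t+1}, a)
    evaluated = q_next_target[np.arange(len(rewards)), best_actions]
    return rewards + discount * evaluated                        # y_t
</pre></div></div>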
</div>
</div>
@@ -254,7 +257,7 @@ Set those values as the targets for the actions that were not actually played.</
<a href="dqn.html" class="btn btn-neutral float-right" title="Deep Q Networks" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../other/dfp.html" class="btn btn-neutral" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../other/dfp.html" class="btn btn-neutral float-left" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -263,7 +266,7 @@ Set those values as the targets for the actions that were not actually played.</
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -280,27 +283,16 @@ Set those values as the targets for the actions that were not actually played.</
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Deep Q Networks &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Deep Q Networks &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Double DQN" href="double_dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,16 +230,16 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</li>
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</p></li>
<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
use the current states from the sampled batch, and run the online network to get the current Q values predictions.
Set those values as the targets for the actions that were not actually played.</li>
<li>For each action that was played, use the following equation for calculating the targets of the network:
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t)+\gamma \cdot max_a Q(s_{t+1},a)\)</span></li>
<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
Set those values as the targets for the actions that were not actually played.</p></li>
<li><p>For each action that was played, use the following equation for calculating the targets of the network:
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t)+\gamma \cdot max_a Q(s_{t+1},a)\)</span></p></li>
<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
</ol>
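<p>Steps 2 through 5 can be illustrated with the following hedged NumPy sketch (names and shapes are assumptions; terminal-state masking is omitted): the target matrix starts as a copy of the online predictions so that non-played actions contribute zero loss, and only the played actions receive the bootstrapped target.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def dqn_targets(q_current_online, q_next_target, actions, rewards, discount=0.99):
    targets = q_current_online.copy()              # step 3: non-played actions keep their predictions
    max_next_q = q_next_target.max(axis=1)         # step 2: max_a Q(s_{t+1}, a)
    rows = np.arange(len(actions))
    targets[rows, actions] = rewards + discount * max_next_q   # step 4: played actions
    return targets                                 # step 5: train the online network with these targets
</pre></div></div>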
<dl class="class">
<dt id="rl_coach.agents.dqn_agent.DQNAlgorithmParameters">
@@ -258,7 +261,7 @@ Set those values as the targets for the actions that were not actually played.</
<a href="dueling_dqn.html" class="btn btn-neutral float-right" title="Dueling DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="double_dqn.html" class="btn btn-neutral" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="double_dqn.html" class="btn btn-neutral float-left" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -267,7 +270,7 @@ Set those values as the targets for the actions that were not actually played.</
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -284,27 +287,16 @@ Set those values as the targets for the actions that were not actually played.</
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Dueling DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Dueling DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Deep Q Networks" href="dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -245,7 +248,7 @@ single action has been taken at this state.</p>
<a href="mmc.html" class="btn btn-neutral float-right" title="Mixed Monte Carlo" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="dqn.html" class="btn btn-neutral" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="dqn.html" class="btn btn-neutral float-left" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -254,7 +257,7 @@ single action has been taken at this state.</p>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -271,27 +274,16 @@ single action has been taken at this state.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Mixed Monte Carlo &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Mixed Monte Carlo &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Dueling DQN" href="dueling_dqn.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -238,16 +241,13 @@ Once in every few thousand steps, copy the weights from the online network to th
<dl class="class">
<dt id="rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.mmc_agent.</code><code class="descname">MixedMonteCarloAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/mmc_agent.html#MixedMonteCarloAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>monte_carlo_mixing_rate</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>monte_carlo_mixing_rate</strong> (float)
The mixing rate is used for setting the amount of the Monte Carlo estimate (full return) that will be mixed into
the single-step bootstrapped targets.</td>
</tr>
</tbody>
</table>
the single-step bootstrapped targets.</p>
</dd>
</dl>
</dd></dl>
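<p>In other words (a one-line illustrative sketch, not Coach's code), the target is a convex combination of the one-step bootstrapped target and the full Monte Carlo return, weighted by monte_carlo_mixing_rate:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
def mixed_target(bootstrapped_target, full_return, monte_carlo_mixing_rate=0.1):
    # blend the single-step bootstrapped target with the full episode return
    return ((1.0 - monte_carlo_mixing_rate) * bootstrapped_target
            + monte_carlo_mixing_rate * full_return)
</pre></div></div>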
</div>
@@ -265,7 +265,7 @@ the single-step bootstrapped targets.</td>
<a href="n_step.html" class="btn btn-neutral float-right" title="N-Step Q Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="dueling_dqn.html" class="btn btn-neutral" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="dueling_dqn.html" class="btn btn-neutral float-left" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -274,7 +274,7 @@ the single-step bootstrapped targets.</td>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -291,27 +291,16 @@ the single-step bootstrapped targets.</td>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>N-Step Q Learning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>N-Step Q Learning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Mixed Monte Carlo" href="mmc.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -228,43 +231,39 @@
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<p>The <span class="math notranslate nohighlight">\(N\)</span>-step Q learning algorithm works in a similar manner to DQN except for the following changes:</p>
<ol class="arabic simple">
<li>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</li>
<li>In order to stabilize the learning, multiple workers work together to update the network.
This creates the same effect as decorrelating the samples used for training.</li>
<li>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
<li><p>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</p></li>
<li><p>In order to stabilize the learning, multiple workers work together to update the network.
This creates the same effect as decorrelating the samples used for training.</p></li>
<li><p>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
to form the <span class="math notranslate nohighlight">\(N\)</span>-step Q targets, according to the following equation:
<span class="math notranslate nohighlight">\(R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})\)</span>
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</li>
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</p></li>
</ol>
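<p>The target in step 3 can be computed with a short backward pass over the last <span class="math notranslate nohighlight">\(N\)</span> rewards, as in the following illustrative sketch (function name and inputs are assumptions, not the Coach implementation):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def n_step_targets(rewards, bootstrap_value, discount=0.99):
    # rewards: r_t ... r_{t+N-1} for the latest N steps
    # bootstrap_value: V(s_{t+N}) estimated by the target network
    targets = np.zeros(len(rewards))
    running = bootstrap_value
    for i in reversed(range(len(rewards))):
        running = rewards[i] + discount * running
        targets[i] = running          # R(s_i, a_i), bootstrapped over k = N - i steps
    return targets
</pre></div></div>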
<dl class="class">
<dt id="rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.n_step_q_agent.</code><code class="descname">NStepQAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/n_step_q_agent.html#NStepQAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</li>
<li><strong>apply_gradients_every_x_episodes</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> (StepMethod)
The number of steps between copying the online network weights to the target network weights.</p></li>
<li><p><strong>apply_gradients_every_x_episodes</strong> (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate them in internal accumulators, and will only apply them to the network once every
apply_gradients_every_x_episodes episodes.</li>
<li><strong>num_steps_between_gradient_updates</strong> (int)
apply_gradients_every_x_episodes episodes.</p></li>
<li><p><strong>num_steps_between_gradient_updates</strong> (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between two consecutive gradient calculations
are used in the batch.</li>
<li><strong>targets_horizon</strong> (str)
are used in the batch.</p></li>
<li><p><strong>targets_horizon</strong> (str)
Should be either N-Step or 1-Step, and defines the horizon over which to bootstrap the network values.
Essentially, 1-Step follows the regular one-step bootstrapping Q-learning update. For more information,
please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</li>
please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -282,7 +281,7 @@ please refer to the original paper (<a class="reference external" href="https://
<a href="naf.html" class="btn btn-neutral float-right" title="Normalized Advantage Functions" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="mmc.html" class="btn btn-neutral" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="mmc.html" class="btn btn-neutral float-left" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -291,7 +290,7 @@ please refer to the original paper (<a class="reference external" href="https://
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -308,27 +307,16 @@ please refer to the original paper (<a class="reference external" href="https://
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Normalized Advantage Functions &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Normalized Advantage Functions &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="N-Step Q Learning" href="n_step.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -258,7 +261,7 @@ After every training step, use a soft update in order to copy the weights from t
<a href="nec.html" class="btn btn-neutral float-right" title="Neural Episodic Control" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="n_step.html" class="btn btn-neutral" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="n_step.html" class="btn btn-neutral float-left" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -267,7 +270,7 @@ After every training step, use a soft update in order to copy the weights from t
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -284,27 +287,16 @@ After every training step, use a soft update in order to copy the weights from t
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Neural Episodic Control &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Neural Episodic Control &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Normalized Advantage Functions" href="naf.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -229,14 +232,14 @@
<div class="section" id="choosing-an-action">
<h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
output from the middleware.</li>
<li>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
<li><p>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
output from the middleware.</p></li>
<li><p>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
The DND is queried and returns the <span class="math notranslate nohighlight">\(P\)</span> nearest neighbor keys and values. The keys and values are used to calculate
and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</li>
<li>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</li>
<li>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</li>
and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</p></li>
<li><p>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</p></li>
<li><p>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</p></li>
</ol>
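<p>The DND lookup in step 2 can be sketched in a few lines of NumPy. This is an illustrative re-implementation of
the inverse-distance kernel from the NEC paper, not the code Coach actually runs; <code>p</code> and <code>delta</code>
loosely correspond to the <code>number_of_knn</code> and <code>l2_norm_added_delta</code> parameters documented further
down this page.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def dnd_q_value(embedding, keys, values, p=50, delta=1e-3):
    """Illustrative DND query: kernel-weighted average over the P nearest stored keys."""
    # squared L2 distance between the query embedding and every key stored for this action
    dist = np.sum((keys - embedding) ** 2, axis=1)
    nearest = np.argsort(dist)[:p]               # indices of the P nearest neighbours
    kernel = 1.0 / (dist[nearest] + delta)       # inverse-distance kernel; delta avoids division by zero
    weights = kernel / np.sum(kernel)
    return np.sum(weights * values[nearest])     # kernel-weighted Q value estimate
</pre></div></div>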
</div>
<div class="section" id="finalizing-an-episode">
@@ -256,40 +259,36 @@ the network if necessary:
<dl class="class">
<dt id="rl_coach.agents.nec_agent.NECAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.nec_agent.</code><code class="descname">NECAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/nec_agent.html#NECAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.nec_agent.NECAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>dnd_size</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dnd_size</strong> (int)
Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
of transitions that will be stored is dnd_size x num_actions.</li>
<li><strong>l2_norm_added_delta</strong> (float)
of transitions that will be stored is dnd_size x num_actions.</p></li>
<li><p><strong>l2_norm_added_delta</strong> (float)
A small value that will be added when calculating the weight of each of the DND entries. This follows the
<span class="math notranslate nohighlight">\(\delta\)</span> patameter defined in the paper.</li>
<li><strong>new_value_shift_coefficient</strong> (float)
<span class="math notranslate nohighlight">\(\delta\)</span> patameter defined in the paper.</p></li>
<li><p><strong>new_value_shift_coefficient</strong> (float)
In the case where a new embedding that is added to the DND is already present, the value that will be stored
in the DND is a mix between the existing value and the new value. The mix rate is defined by
new_value_shift_coefficient.</li>
<li><strong>number_of_knn</strong> (int)
The number of neighbors that will be retrieved for each DND query.</li>
<li><strong>DND_key_error_threshold</strong> (float)
new_value_shift_coefficient.</p></li>
<li><p><strong>number_of_knn</strong> (int)
The number of neighbors that will be retrieved for each DND query.</p></li>
<li><p><strong>DND_key_error_threshold</strong> (float)
When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
exists in the DND, since exact matches of embeddings are very rare.</li>
<li><strong>propagate_updates_to_DND</strong> (bool)
exists in the DND, since exact matches of embeddings are very rare.</p></li>
<li><p><strong>propagate_updates_to_DND</strong> (bool)
If set to True, when the gradients of the network are calculated, they will also be
backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
network weights.</li>
<li><strong>n_step</strong> (int)
The bootstrap length that will be used when calculating the state values to store in the DND.</li>
<li><strong>bootstrap_total_return_from_old_policy</strong> (bool)
network weights.</p></li>
<li><p><strong>n_step</strong> (int)
The bootstrap length that will be used when calculating the state values to store in the DND.</p></li>
<li><p><strong>bootstrap_total_return_from_old_policy</strong> (bool)
If set to True, the bootstrap that will be used to calculate each state-action value is the network value from
when the state was first seen, and not the latest, most up-to-date network value.</li>
when the state was first seen, and not the latest, most up-to-date network value.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -307,7 +306,7 @@ when the state was first seen, and not the latest, most up-to-date network value
<a href="pal.html" class="btn btn-neutral float-right" title="Persistent Advantage Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="naf.html" class="btn btn-neutral" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="naf.html" class="btn btn-neutral float-left" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -316,7 +315,7 @@ when the state was first seen, and not the latest, most up-to-date network value
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -333,27 +332,16 @@ when the state was first seen, and not the latest, most up-to-date network value
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Persistent Advantage Learning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Persistent Advantage Learning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Neural Episodic Control" href="nec.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,47 +230,43 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Start by calculating the initial target values in the same manner as they are calculated in DDQN
<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
<li>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>Start by calculating the initial target values in the same manner as they are calculated in DDQN
<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
<li><p>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
To calculate the action gap, run the target network using the current states and get the <span class="math notranslate nohighlight">\(Q\)</span> values
for all the actions. Then estimate <span class="math notranslate nohighlight">\(V\)</span> as the maximum predicted <span class="math notranslate nohighlight">\(Q\)</span> value for the current state:
<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></li>
<li>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></p></li>
<li><p>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
the targets <span class="math notranslate nohighlight">\(y_t^{DDQN}\)</span>:
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></li>
<li>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></p></li>
<li><p>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
gap for the next state:
<span class="math notranslate nohighlight">\(V(s_{t+1} )-Q(s_{t+1},a_{t+1})\)</span>
where <span class="math notranslate nohighlight">\(a_{t+1}\)</span> is chosen by running the next states through the online network and choosing the action that
has the highest predicted <span class="math notranslate nohighlight">\(Q\)</span> value. Finally, the targets will be defined as:
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></li>
<li>Train the online network using the current states as inputs, and with the aforementioned targets.</li>
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></p></li>
<li><p>Train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
</ol>
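<p>Steps 2 to 5 boil down to a per-batch target computation. The sketch below is an illustrative NumPy version of
that computation, not the Coach implementation; <code>alpha</code> and <code>persistent</code> correspond to the
<code>pal_alpha</code> and <code>persistent_advantage_learning</code> parameters documented below, and the Q value
arrays are assumed to have shape (batch, num_actions).</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def pal_targets(q_online_next, q_target_curr, q_target_next,
                actions, rewards, dones, gamma=0.99, alpha=0.9, persistent=True):
    """Illustrative (P)AL targets: DDQN targets minus a weighted action gap."""
    batch = np.arange(len(actions))
    # DDQN target: greedy action from the online network, value from the target network
    a_next = np.argmax(q_online_next, axis=1)
    y_ddqn = rewards + gamma * (1.0 - dones) * q_target_next[batch, a_next]
    # action gap for the current state, V(s_t) - Q(s_t, a_t), from the target network
    gap_curr = np.max(q_target_curr, axis=1) - q_target_curr[batch, actions]
    if persistent:
        # PAL also uses the action gap of the next state and takes the minimum of the two
        gap_next = np.max(q_target_next, axis=1) - q_target_next[batch, a_next]
        gap = np.minimum(gap_curr, gap_next)
    else:
        gap = gap_curr
    return y_ddqn - alpha * gap
</pre></div></div>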
<dl class="class">
<dt id="rl_coach.agents.pal_agent.PALAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.pal_agent.</code><code class="descname">PALAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/pal_agent.html#PALAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.pal_agent.PALAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pal_alpha</strong> (float)
A factor that weights the amount by which the advantage learning update will be taken into account.</li>
<li><strong>persistent_advantage_learning</strong> (bool)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>pal_alpha</strong> (float)
A factor that weights the amount by which the advantage learning update will be taken into account.</p></li>
<li><p><strong>persistent_advantage_learning</strong> (bool)
If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
the same actions one after the other instead of changing actions.</li>
<li><strong>monte_carlo_mixing_rate</strong> (float)
the same actions one after the other instead of changing actions.</p></li>
<li><p><strong>monte_carlo_mixing_rate</strong> (float)
The amount of Monte Carlo values to mix into the targets of the network. The Monte Carlo values are just the
total discounted returns, and they can help reduce the time it takes for the network to update to the newly
seen values, since they are not based on bootstrapping the current network values.</li>
seen values, since they are not based on bootstrapping the current network values.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -285,7 +284,7 @@ seen values, since it is not based on bootstrapping the current network values.<
<a href="../policy_optimization/pg.html" class="btn btn-neutral float-right" title="Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="nec.html" class="btn btn-neutral" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="nec.html" class="btn btn-neutral float-left" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -294,7 +293,7 @@ seen values, since it is not based on bootstrapping the current network values.<
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -311,27 +310,16 @@ seen values, since it is not based on bootstrapping the current network values.<
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Quantile Regression DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Quantile Regression DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Rainbow" href="rainbow.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -227,33 +230,29 @@
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic simple">
<li>Sample a batch of transitions from the replay buffer.</li>
<li>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
by following the Bellman equation.
Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the
quantile midpoint targets.</li>
<li>The network is trained with the quantile regression loss between the resulting quantile locations and the target
quantile locations. Only the targets of the actions that were actually taken are updated.</li>
<li>Once in every few thousand steps, weights are copied from the online network to the target network.</li>
quantile midpoint targets.</p></li>
<li><p>The network is trained with the quantile regression loss between the resulting quantile locations and the target
quantile locations. Only the targets of the actions that were actually taken are updated.</p></li>
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
</ol>
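<p>The loss in step 3 is the quantile-regression Huber loss. The function below is an illustrative NumPy version for
a single state-action pair, not the implementation Coach itself uses; <code>kappa</code> corresponds to the
<code>huber_loss_interval</code> parameter documented below.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def quantile_huber_loss(pred_quantiles, target_quantiles, kappa=1.0):
    """Illustrative quantile-regression Huber loss for one state-action pair."""
    n = len(pred_quantiles)
    tau = (np.arange(n) + 0.5) / n                       # quantile midpoints for the predictions
    # pairwise TD errors: every target quantile against every predicted quantile
    u = target_quantiles[np.newaxis, :] - pred_quantiles[:, np.newaxis]
    abs_u = np.abs(u)
    # Huber loss written without comparisons: quadratic inside [-kappa, kappa], linear outside
    huber = 0.5 * np.minimum(abs_u, kappa) ** 2 + kappa * np.maximum(abs_u - kappa, 0.0)
    # asymmetric quantile weighting |tau - 1{u is negative}|, using signbit as the indicator
    weight = np.abs(tau[:, np.newaxis] - np.signbit(u))
    return np.mean(np.sum(weight * huber / kappa, axis=0))
</pre></div></div>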
<dl class="class">
<dt id="rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.qr_dqn_agent.</code><code class="descname">QuantileRegressionDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/qr_dqn_agent.html#QuantileRegressionDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>atoms</strong> (int)
The number of atoms to predict for each action.</li>
<li><strong>huber_loss_interval</strong> (float)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>atoms</strong> (int)
The number of atoms to predict for each action.</p></li>
<li><p><strong>huber_loss_interval</strong> (float)
One of the Huber loss parameters, referred to as <span class="math notranslate nohighlight">\(\kappa\)</span> in the paper.
It describes the interval [-k, k] in which the Huber loss acts as an MSE loss.</li>
It describes the interval [-k, k] in which the Huber loss acts as an MSE loss.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
</div>
@@ -271,7 +270,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
<a href="../../architectures/index.html" class="btn btn-neutral float-right" title="Architectures" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="rainbow.html" class="btn btn-neutral" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="rainbow.html" class="btn btn-neutral float-left" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -280,7 +279,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -297,27 +296,16 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>

View File

@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Rainbow &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
<title>Rainbow &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
<link rel="prev" title="Proximal Policy Optimization" href="../policy_optimization/ppo.html" />
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<div class="wy-side-nav-search" >
@@ -226,19 +229,18 @@
<h2>Algorithm Description<a class="headerlink" href="#algorithm-description" title="Permalink to this headline"></a></h2>
<p>Rainbow combines 6 recent advancements in reinforcement learning:</p>
<ul class="simple">
<li>N-step returns</li>
<li>Distributional state-action value learning</li>
<li>Dueling networks</li>
<li>Noisy Networks</li>
<li>Double DQN</li>
<li>Prioritized Experience Replay</li>
<li><p>N-step returns</p></li>
<li><p>Distributional state-action value learning</p></li>
<li><p>Dueling networks</p></li>
<li><p>Noisy Networks</p></li>
<li><p>Double DQN</p></li>
<li><p>Prioritized Experience Replay</p></li>
</ul>
<div class="section" id="training-the-network">
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline"></a></h3>
<ol class="arabic">
<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
</li>
<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
<p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
<p>where:
@@ -246,36 +248,29 @@ that the <span class="math notranslate nohighlight">\(i-th\)</span> component of
* <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom
<span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r_t+\gamma r_{t+1} + ... + \gamma r_{t+n-1} + \gamma^{n-1} z_j\)</span></p>
</li>
<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
probability distribution. Only the target of the actions that were actually taken is updated.</p>
</li>
<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
</li>
<li><p class="first">After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
using the KL divergence loss that is returned from the network.</p>
</li>
<li><p>The network is trained with the cross entropy loss between the resulting probability distribution and the target
probability distribution. Only the target of the actions that were actually taken is updated.</p></li>
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
<li><p>After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
using the KL divergence loss that is returned from the network.</p></li>
</ol>
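<p>The projection in step 2 can be made concrete with a small NumPy sketch. This is an illustrative re-implementation
of the categorical (C51-style) projection of the n-step Bellman update, not the code Coach runs; the support bounds
<code>v_min</code>/<code>v_max</code> and the atom count are arbitrary example values.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def project_distribution(next_probs, n_step_rewards, bootstrap_discount, dones,
                         v_min=-10.0, v_max=10.0, n_atoms=51):
    """Illustrative categorical projection of the n-step Bellman update onto the fixed support."""
    z = np.linspace(v_min, v_max, n_atoms)               # the fixed atom locations
    delta_z = (v_max - v_min) / (n_atoms - 1)
    projected = np.zeros_like(next_probs)
    for i in range(next_probs.shape[0]):
        # n-step Bellman update of every atom, clipped to [v_min, v_max]
        tz = np.clip(n_step_rewards[i] + bootstrap_discount * (1.0 - dones[i]) * z, v_min, v_max)
        b = (tz - v_min) / delta_z                       # fractional atom index of each updated atom
        lower = np.floor(b).astype(int)
        upper = np.ceil(b).astype(int)
        for j in range(n_atoms):
            if lower[j] == upper[j]:                     # the update landed exactly on an atom
                projected[i, lower[j]] += next_probs[i, j]
            else:                                        # split the mass between the two nearest atoms
                projected[i, lower[j]] += next_probs[i, j] * (upper[j] - b[j])
                projected[i, upper[j]] += next_probs[i, j] * (b[j] - lower[j])
    return projected
</pre></div></div>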
<dl class="class">
<dt id="rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters">
<em class="property">class </em><code class="descclassname">rl_coach.agents.rainbow_dqn_agent.</code><code class="descname">RainbowDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/rainbow_dqn_agent.html#RainbowDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters" title="Permalink to this definition"></a></dt>
<dd><table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>n_step</strong> (int)
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>n_step</strong> (int)
The number of steps to bootstrap the network over. The actual rewards of the first N-1 steps will be accumulated
using increasing powers of the discount factor, and the Nth step will be bootstrapped from the network
prediction.</li>
<li><strong>store_transitions_only_when_episodes_are_terminated</strong> (bool)
prediction.</p></li>
<li><p><strong>store_transitions_only_when_episodes_are_terminated</strong> (bool)
If set to True, the transitions will be stored in an Episode object until the episode ends, and only then
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
transitions into the memory, and to do so we need the entire episode first.</li>
transitions into the memory, and to do so we need the entire episode first.</p></li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd>
</dl>
</dd></dl>
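<p>For completeness, a minimal sketch of setting these two parameters in a preset, again assuming the usual Coach
pattern; <code>RainbowDQNAgentParameters</code> is referenced from memory of the library rather than from this page,
and the value of <code>n_step</code> is just an example.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
# Hypothetical preset-style configuration (class name assumed, values illustrative)
from rl_coach.agents.rainbow_dqn_agent import RainbowDQNAgentParameters

agent_params = RainbowDQNAgentParameters()
agent_params.algorithm.n_step = 3   # bootstrap from the network prediction after 3 steps
# keep whole episodes aside so the n-step returns can be computed before writing to memory
agent_params.algorithm.store_transitions_only_when_episodes_are_terminated = True
</pre></div></div>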
</div>
@@ -293,7 +288,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
<a href="qr_dqn.html" class="btn btn-neutral float-right" title="Quantile Regression DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../policy_optimization/ppo.html" class="btn btn-neutral" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="../policy_optimization/ppo.html" class="btn btn-neutral float-left" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -302,7 +297,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
<div role="contentinfo">
<p>
&copy; Copyright 2018, Intel AI Lab
&copy; Copyright 2018-2019, Intel AI Lab
</p>
</div>
@@ -319,27 +314,16 @@ transitions into the memory, and to do so we need the entire episode first.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>
</body>
</html>