Enabling Coach Documentation to be run even when environments are not installed (#326)

2026-03-19 08:23:33 +01:00 · 2019-05-27 10:46:07 +03:00
parent 2b7d536da4
commit 342b7184bc
157 changed files with 5167 additions and 7477 deletions
--- a/docs/components/agents/value_optimization/bs_dqn.html
+++ b/docs/components/agents/value_optimization/bs_dqn.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Bootstrapped DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Bootstrapped DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Behavioral Cloning" href="../imitation/bc.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -265,7 +268,7 @@ Then, train the online network according to the calculated targets.</p>
        <a href="categorical_dqn.html" class="btn btn-neutral float-right" title="Categorical DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="../imitation/bc.html" class="btn btn-neutral" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="../imitation/bc.html" class="btn btn-neutral float-left" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -274,7 +277,7 @@ Then, train the online network according to the calculated targets.</p>

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -291,27 +294,16 @@ Then, train the online network according to the calculated targets.</p>
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/categorical_dqn.html
+++ b/docs/components/agents/value_optimization/categorical_dqn.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Categorical DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Categorical DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Bootstrapped DQN" href="bs_dqn.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -227,43 +230,36 @@
 <div class="section" id="training-the-network">
 <h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
 <ol class="arabic">
-<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
-</li>
-<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
+<li><p>Sample a batch of transitions from the replay buffer.</p></li>
+<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
 that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
 <p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
 <p>where:
 *  <span class="math notranslate nohighlight">\([ \cdot ]^b_a\)</span> bounds its argument in the range <span class="math notranslate nohighlight">\([a, b]\)</span>
 *  <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom <span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r+\gamma z_j\)</span></p>
 </li>
-<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
-probability distribution.   Only the target of the actions that were actually taken is updated.</p>
-</li>
-<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
-</li>
+<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
+probability distribution.   Only the target of the actions that were actually taken is updated.</p></li>
+<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
 </ol>
 <dl class="class">
 <dt id="rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters">
 <em class="property">class </em><code class="descclassname">rl_coach.agents.categorical_dqn_agent.</code><code class="descname">CategoricalDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/categorical_dqn_agent.html#CategoricalDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
-<dd><table class="docutils field-list" frame="void" rules="none">
-<col class="field-name" />
-<col class="field-body" />
-<tbody valign="top">
-<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
-<li><strong>v_min</strong> – (float)
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>v_min</strong> – (float)
 The minimal value that will be represented in the network output for predicting the Q value.
-Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</li>
-<li><strong>v_max</strong> – (float)
+Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</p></li>
+<li><p><strong>v_max</strong> – (float)
 The maximum value that will be represented in the network output for predicting the Q value.
-Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</li>
-<li><strong>atoms</strong> – (int)
+Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</p></li>
+<li><p><strong>atoms</strong> – (int)
 The number of atoms that will be used to discretize the range between v_min and v_max.
-For the C51 algorithm described in the paper, the number of atoms is 51.</li>
+For the C51 algorithm described in the paper, the number of atoms is 51.</p></li>
 </ul>
-</td>
-</tr>
-</tbody>
-</table>
+</dd>
+</dl>
 </dd></dl>

 </div>
@@ -281,7 +277,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
        <a href="../imitation/cil.html" class="btn btn-neutral float-right" title="Conditional Imitation Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="bs_dqn.html" class="btn btn-neutral" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="bs_dqn.html" class="btn btn-neutral float-left" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -290,7 +286,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -307,27 +303,16 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/double_dqn.html
+++ b/docs/components/agents/value_optimization/double_dqn.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Double DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Double DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Direct Future Prediction" href="../other/dfp.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -227,17 +230,17 @@
 <div class="section" id="training-the-network">
 <h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
 <ol class="arabic simple">
-<li>Sample a batch of transitions from the replay buffer.</li>
-<li>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
+<li><p>Sample a batch of transitions from the replay buffer.</p></li>
+<li><p>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
 action <span class="math notranslate nohighlight">\(argmax_a Q(s_{t+1},a)\)</span>. For these actions, use the corresponding next states and run the target
-network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</li>
-<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
+network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</p></li>
+<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
 use the current states from the sampled batch, and run the online network to get the current Q values predictions.
-Set those values as the targets for the actions that were not actually played.</li>
-<li>For each action that was played, use the following equation for calculating the targets of the network:
-<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
-<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
-<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
+Set those values as the targets for the actions that were not actually played.</p></li>
+<li><p>For each action that was played, use the following equation for calculating the targets of the network:
+<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
+<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
+<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
 </ol>
 </div>
 </div>
@@ -254,7 +257,7 @@ Set those values as the targets for the actions that were not actually played.</
        <a href="dqn.html" class="btn btn-neutral float-right" title="Deep Q Networks" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="../other/dfp.html" class="btn btn-neutral" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="../other/dfp.html" class="btn btn-neutral float-left" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -263,7 +266,7 @@ Set those values as the targets for the actions that were not actually played.</

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -280,27 +283,16 @@ Set those values as the targets for the actions that were not actually played.</
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/dqn.html
+++ b/docs/components/agents/value_optimization/dqn.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Deep Q Networks &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Deep Q Networks &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Double DQN" href="double_dqn.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -227,16 +230,16 @@
 <div class="section" id="training-the-network">
 <h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
 <ol class="arabic simple">
-<li>Sample a batch of transitions from the replay buffer.</li>
-<li>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
-the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</li>
-<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
+<li><p>Sample a batch of transitions from the replay buffer.</p></li>
+<li><p>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
+the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</p></li>
+<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
 use the current states from the sampled batch, and run the online network to get the current Q values predictions.
-Set those values as the targets for the actions that were not actually played.</li>
-<li>For each action that was played, use the following equation for calculating the targets of the network:
-<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot max_a Q(s_{t+1})\)</span></li>
-<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
-<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
+Set those values as the targets for the actions that were not actually played.</p></li>
+<li><p>For each action that was played, use the following equation for calculating the targets of the network:
+<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot max_a Q(s_{t+1},a)\)</span></p></li>
+<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
+<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
 </ol>
 <dl class="class">
 <dt id="rl_coach.agents.dqn_agent.DQNAlgorithmParameters">
@@ -258,7 +261,7 @@ Set those values as the targets for the actions that were not actually played.</
        <a href="dueling_dqn.html" class="btn btn-neutral float-right" title="Dueling DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="double_dqn.html" class="btn btn-neutral" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="double_dqn.html" class="btn btn-neutral float-left" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -267,7 +270,7 @@ Set those values as the targets for the actions that were not actually played.</

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -284,27 +287,16 @@ Set those values as the targets for the actions that were not actually played.</
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/dueling_dqn.html
+++ b/docs/components/agents/value_optimization/dueling_dqn.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Dueling DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Dueling DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Deep Q Networks" href="dqn.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -245,7 +248,7 @@ single action has been taken at this state.</p>
        <a href="mmc.html" class="btn btn-neutral float-right" title="Mixed Monte Carlo" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="dqn.html" class="btn btn-neutral" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="dqn.html" class="btn btn-neutral float-left" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -254,7 +257,7 @@ single action has been taken at this state.</p>

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -271,27 +274,16 @@ single action has been taken at this state.</p>
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/mmc.html
+++ b/docs/components/agents/value_optimization/mmc.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Mixed Monte Carlo &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Mixed Monte Carlo &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Dueling DQN" href="dueling_dqn.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -238,16 +241,13 @@ Once in every few thousand steps, copy the weights from the online network to th
 <dl class="class">
 <dt id="rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters">
 <em class="property">class </em><code class="descclassname">rl_coach.agents.mmc_agent.</code><code class="descname">MixedMonteCarloAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/mmc_agent.html#MixedMonteCarloAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
-<dd><table class="docutils field-list" frame="void" rules="none">
-<col class="field-name" />
-<col class="field-body" />
-<tbody valign="top">
-<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>monte_carlo_mixing_rate</strong> – (float)
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><p><strong>monte_carlo_mixing_rate</strong> – (float)
 The mixing rate is used for setting the amount of monte carlo estimate (full return) that will be mixes into
-the single-step bootstrapped targets.</td>
-</tr>
-</tbody>
-</table>
+the single-step bootstrapped targets.</p>
+</dd>
+</dl>
 </dd></dl>

 </div>
@@ -265,7 +265,7 @@ the single-step bootstrapped targets.</td>
        <a href="n_step.html" class="btn btn-neutral float-right" title="N-Step Q Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="dueling_dqn.html" class="btn btn-neutral" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="dueling_dqn.html" class="btn btn-neutral float-left" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -274,7 +274,7 @@ the single-step bootstrapped targets.</td>

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -291,27 +291,16 @@ the single-step bootstrapped targets.</td>
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/n_step.html
+++ b/docs/components/agents/value_optimization/n_step.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>N-Step Q Learning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>N-Step Q Learning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Mixed Monte Carlo" href="mmc.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -228,43 +231,39 @@
 <h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
 <p>The <span class="math notranslate nohighlight">\(N\)</span>-step Q learning algorithm works in similar manner to DQN except for the following changes:</p>
 <ol class="arabic simple">
-<li>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
-<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</li>
-<li>In order to stabilize the learning, multiple workers work together to update the network.
-This creates the same effect as uncorrelating the samples used for training.</li>
-<li>Instead of using single-step Q targets for the network, the rewards from $N$ consequent steps are accumulated
+<li><p>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
+<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</p></li>
+<li><p>In order to stabilize the learning, multiple workers work together to update the network.
+This creates the same effect as decorrelating the samples used for training.</p></li>
+<li><p>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
 to form the <span class="math notranslate nohighlight">\(N\)</span>-step Q targets, according to the following equation:
 <span class="math notranslate nohighlight">\(R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})\)</span>
-where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</li>
+where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</p></li>
 </ol>
 <dl class="class">
 <dt id="rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters">
 <em class="property">class </em><code class="descclassname">rl_coach.agents.n_step_q_agent.</code><code class="descname">NStepQAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/n_step_q_agent.html#NStepQAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
-<dd><table class="docutils field-list" frame="void" rules="none">
-<col class="field-name" />
-<col class="field-body" />
-<tbody valign="top">
-<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
-<li><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
-The number of steps between copying the online network weights to the target network weights.</li>
-<li><strong>apply_gradients_every_x_episodes</strong> – (int)
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
+The number of steps between copying the online network weights to the target network weights.</p></li>
+<li><p><strong>apply_gradients_every_x_episodes</strong> – (int)
 The number of episodes between applying the accumulated gradients to the network. After every
 num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
 it will then accumulate it in internal accumulators, and will only apply them to the network once in every
-apply_gradients_every_x_episodes episodes.</li>
-<li><strong>num_steps_between_gradient_updates</strong> – (int)
+apply_gradients_every_x_episodes episodes.</p></li>
+<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
 The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
 called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
-are used in the batch.</li>
-<li><strong>targets_horizon</strong> – (str)
+are used in the batch.</p></li>
+<li><p><strong>targets_horizon</strong> – (str)
 Should be either ‘N-Step’ or ‘1-Step’, and defines the length for which to bootstrap the network values over.
 Essentially, 1-Step follows the regular 1 step bootstrapping Q learning update. For more information,
-please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</li>
+please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</p></li>
 </ul>
-</td>
-</tr>
-</tbody>
-</table>
+</dd>
+</dl>
 </dd></dl>

 </div>
@@ -282,7 +281,7 @@ please refer to the original paper (<a class="reference external" href="https://
        <a href="naf.html" class="btn btn-neutral float-right" title="Normalized Advantage Functions" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="mmc.html" class="btn btn-neutral" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="mmc.html" class="btn btn-neutral float-left" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -291,7 +290,7 @@ please refer to the original paper (<a class="reference external" href="https://

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -308,27 +307,16 @@ please refer to the original paper (<a class="reference external" href="https://
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/naf.html
+++ b/docs/components/agents/value_optimization/naf.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Normalized Advantage Functions &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Normalized Advantage Functions &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="N-Step Q Learning" href="n_step.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -258,7 +261,7 @@ After every training step, use a soft update in order to copy the weights from t
        <a href="nec.html" class="btn btn-neutral float-right" title="Neural Episodic Control" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="n_step.html" class="btn btn-neutral" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="n_step.html" class="btn btn-neutral float-left" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -267,7 +270,7 @@ After every training step, use a soft update in order to copy the weights from t

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -284,27 +287,16 @@ After every training step, use a soft update in order to copy the weights from t
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/nec.html
+++ b/docs/components/agents/value_optimization/nec.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Neural Episodic Control &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Neural Episodic Control &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Normalized Advantage Functions" href="naf.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -229,14 +232,14 @@
 <div class="section" id="choosing-an-action">
 <h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline">¶</a></h3>
 <ol class="arabic simple">
-<li>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
-output from the middleware.</li>
-<li>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
+<li><p>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
+output from the middleware.</p></li>
+<li><p>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
 The DND is queried and returns the <span class="math notranslate nohighlight">\(P\)</span> nearest neighbor keys and values. The keys and values are used to calculate
-and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</li>
-<li>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</li>
-<li>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
-accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</li>
+and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</p></li>
+<li><p>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</p></li>
+<li><p>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
+accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</p></li>
 </ol>
 </div>
 <div class="section" id="finalizing-an-episode">
@@ -256,40 +259,36 @@ the network if necessary:
 <dl class="class">
 <dt id="rl_coach.agents.nec_agent.NECAlgorithmParameters">
 <em class="property">class </em><code class="descclassname">rl_coach.agents.nec_agent.</code><code class="descname">NECAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/nec_agent.html#NECAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.nec_agent.NECAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
-<dd><table class="docutils field-list" frame="void" rules="none">
-<col class="field-name" />
-<col class="field-body" />
-<tbody valign="top">
-<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
-<li><strong>dnd_size</strong> – (int)
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>dnd_size</strong> – (int)
 Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
-of transitions that will be stored is dnd_size x num_actions.</li>
-<li><strong>l2_norm_added_delta</strong> – (float)
+of transitions that will be stored is dnd_size x num_actions.</p></li>
+<li><p><strong>l2_norm_added_delta</strong> – (float)
 A small value that will be added when calculating the weight of each of the DND entries. This follows the
-<span class="math notranslate nohighlight">\(\delta\)</span> patameter defined in the paper.</li>
-<li><strong>new_value_shift_coefficient</strong> – (float)
+<span class="math notranslate nohighlight">\(\delta\)</span> parameter defined in the paper.</p></li>
+<li><p><strong>new_value_shift_coefficient</strong> – (float)
 In the case where a ew embedding that was added to the DND was already present, the value that will be stored
 in the DND is a mix between the existing value and the new value. The mix rate is defined by
-new_value_shift_coefficient.</li>
-<li><strong>number_of_knn</strong> – (int)
-The number of neighbors that will be retrieved for each DND query.</li>
-<li><strong>DND_key_error_threshold</strong> – (float)
+new_value_shift_coefficient.</p></li>
+<li><p><strong>number_of_knn</strong> – (int)
+The number of neighbors that will be retrieved for each DND query.</p></li>
+<li><p><strong>DND_key_error_threshold</strong> – (float)
 When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
-exists in the DND, since exact matches of embeddings are very rare.</li>
-<li><strong>propagate_updates_to_DND</strong> – (bool)
+exists in the DND, since exact matches of embeddings are very rare.</p></li>
+<li><p><strong>propagate_updates_to_DND</strong> – (bool)
 If set to True, when the gradients of the network will be calculated, the gradients will also be
 backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
-network weights.</li>
-<li><strong>n_step</strong> – (int)
-The bootstrap length that will be used when calculating the state values to store in the DND.</li>
-<li><strong>bootstrap_total_return_from_old_policy</strong> – (bool)
+network weights.</p></li>
+<li><p><strong>n_step</strong> – (int)
+The bootstrap length that will be used when calculating the state values to store in the DND.</p></li>
+<li><p><strong>bootstrap_total_return_from_old_policy</strong> – (bool)
 If set to True, the bootstrap that will be used to calculate each state-action value, is the network value
-when the state was first seen, and not the latest, most up-to-date network value.</li>
+when the state was first seen, and not the latest, most up-to-date network value.</p></li>
 </ul>
-</td>
-</tr>
-</tbody>
-</table>
+</dd>
+</dl>
 </dd></dl>

 </div>
@@ -307,7 +306,7 @@ when the state was first seen, and not the latest, most up-to-date network value
        <a href="pal.html" class="btn btn-neutral float-right" title="Persistent Advantage Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="naf.html" class="btn btn-neutral" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="naf.html" class="btn btn-neutral float-left" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -316,7 +315,7 @@ when the state was first seen, and not the latest, most up-to-date network value

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -333,27 +332,16 @@ when the state was first seen, and not the latest, most up-to-date network value
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/pal.html
+++ b/docs/components/agents/value_optimization/pal.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Persistent Advantage Learning &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Persistent Advantage Learning &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Neural Episodic Control" href="nec.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -227,47 +230,43 @@
 <div class="section" id="training-the-network">
 <h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
 <ol class="arabic simple">
-<li>Sample a batch of transitions from the replay buffer.</li>
-<li>Start by calculating the initial target values in the same manner as they are calculated in DDQN
-<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
-<li>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
+<li><p>Sample a batch of transitions from the replay buffer.</p></li>
+<li><p>Start by calculating the initial target values in the same manner as they are calculated in DDQN
+<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
+<li><p>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
 To calculate the action gap, run the target network using the current states and get the <span class="math notranslate nohighlight">\(Q\)</span> values
 for all the actions. Then estimate <span class="math notranslate nohighlight">\(V\)</span> as the maximum predicted <span class="math notranslate nohighlight">\(Q\)</span> value for the current state:
-<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></li>
-<li>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
+<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></p></li>
+<li><p>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
 the targets <span class="math notranslate nohighlight">\(y_t^{DDQN}\)</span>:
-<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></li>
-<li>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
+<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></p></li>
+<li><p>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
 gap for the next state:
 <span class="math notranslate nohighlight">\(V(s_{t+1} )-Q(s_{t+1},a_{t+1})\)</span>
 where <span class="math notranslate nohighlight">\(a_{t+1}\)</span> is chosen by running the next states through the online network and choosing the action that
 has the highest predicted <span class="math notranslate nohighlight">\(Q\)</span> value. Finally, the targets will be defined as -
-<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></li>
-<li>Train the online network using the current states as inputs, and with the aforementioned targets.</li>
-<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
+<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></p></li>
+<li><p>Train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
+<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
 </ol>
 <dl class="class">
 <dt id="rl_coach.agents.pal_agent.PALAlgorithmParameters">
 <em class="property">class </em><code class="descclassname">rl_coach.agents.pal_agent.</code><code class="descname">PALAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/pal_agent.html#PALAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.pal_agent.PALAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
-<dd><table class="docutils field-list" frame="void" rules="none">
-<col class="field-name" />
-<col class="field-body" />
-<tbody valign="top">
-<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
-<li><strong>pal_alpha</strong> – (float)
-A factor that weights the amount by which the advantage learning update will be taken into account.</li>
-<li><strong>persistent_advantage_learning</strong> – (bool)
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>pal_alpha</strong> – (float)
+A factor that weights the amount by which the advantage learning update will be taken into account.</p></li>
+<li><p><strong>persistent_advantage_learning</strong> – (bool)
 If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
-the same actions one after the other instead of changing actions.</li>
-<li><strong>monte_carlo_mixing_rate</strong> – (float)
+the same actions one after the other instead of changing actions.</p></li>
+<li><p><strong>monte_carlo_mixing_rate</strong> – (float)
 The amount of monte carlo values to mix into the targets of the network. The monte carlo values are just the
 total discounted returns, and they can help reduce the time it takes for the network to update to the newly
-seen values, since it is not based on bootstrapping the current network values.</li>
+seen values, since it is not based on bootstrapping the current network values.</p></li>
 </ul>
-</td>
-</tr>
-</tbody>
-</table>
+</dd>
+</dl>
 </dd></dl>

 </div>
@@ -285,7 +284,7 @@ seen values, since it is not based on bootstrapping the current network values.<
        <a href="../policy_optimization/pg.html" class="btn btn-neutral float-right" title="Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="nec.html" class="btn btn-neutral" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="nec.html" class="btn btn-neutral float-left" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -294,7 +293,7 @@ seen values, since it is not based on bootstrapping the current network values.<

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -311,27 +310,16 @@ seen values, since it is not based on bootstrapping the current network values.<
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/qr_dqn.html
+++ b/docs/components/agents/value_optimization/qr_dqn.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Quantile Regression DQN &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Quantile Regression DQN &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Rainbow" href="rainbow.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -227,33 +230,29 @@
 <div class="section" id="training-the-network">
 <h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
 <ol class="arabic simple">
-<li>Sample a batch of transitions from the replay buffer.</li>
-<li>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
+<li><p>Sample a batch of transitions from the replay buffer.</p></li>
+<li><p>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
 by following the Bellman equation.
 Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the
-quantile midpoints targets.</li>
-<li>The network is trained with the quantile regression loss between the resulting quantile locations and the target
-quantile locations. Only the targets of the actions that were actually taken are updated.</li>
-<li>Once in every few thousand steps, weights are copied from the online network to the target network.</li>
+quantile midpoints targets.</p></li>
+<li><p>The network is trained with the quantile regression loss between the resulting quantile locations and the target
+quantile locations. Only the targets of the actions that were actually taken are updated.</p></li>
+<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
 </ol>
 <dl class="class">
 <dt id="rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters">
 <em class="property">class </em><code class="descclassname">rl_coach.agents.qr_dqn_agent.</code><code class="descname">QuantileRegressionDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/qr_dqn_agent.html#QuantileRegressionDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
-<dd><table class="docutils field-list" frame="void" rules="none">
-<col class="field-name" />
-<col class="field-body" />
-<tbody valign="top">
-<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
-<li><strong>atoms</strong> – (int)
-the number of atoms to predict for each action</li>
-<li><strong>huber_loss_interval</strong> – (float)
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>atoms</strong> – (int)
+the number of atoms to predict for each action</p></li>
+<li><p><strong>huber_loss_interval</strong> – (float)
 One of the huber loss parameters, and is referred to as <span class="math notranslate nohighlight">\(\kappa\)</span> in the paper.
-It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</li>
+It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</p></li>
 </ul>
-</td>
-</tr>
-</tbody>
-</table>
+</dd>
+</dl>
 </dd></dl>

 </div>
@@ -271,7 +270,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
        <a href="../../architectures/index.html" class="btn btn-neutral float-right" title="Architectures" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="rainbow.html" class="btn btn-neutral" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="rainbow.html" class="btn btn-neutral float-left" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -280,7 +279,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -297,27 +296,16 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>
--- a/docs/components/agents/value_optimization/rainbow.html
+++ b/docs/components/agents/value_optimization/rainbow.html
@@ -8,7 +8,7 @@
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
-  <title>Rainbow &mdash; Reinforcement Learning Coach 0.11.0 documentation</title>
+  <title>Rainbow &mdash; Reinforcement Learning Coach 0.12.1 documentation</title>
  

  
@@ -17,13 +17,21 @@
  

  
+  <script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
+        <script type="text/javascript" src="../../../_static/jquery.js"></script>
+        <script type="text/javascript" src="../../../_static/underscore.js"></script>
+        <script type="text/javascript" src="../../../_static/doctools.js"></script>
+        <script type="text/javascript" src="../../../_static/language_data.js"></script>
+        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
+    
+    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

-  
-  
    

  
-
  <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
@@ -33,21 +41,16 @@
    <link rel="prev" title="Proximal Policy Optimization" href="../policy_optimization/ppo.html" />
    <link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">

-
-  
-  <script src="../../../_static/js/modernizr.min.js"></script>
-
 </head>

 <body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
-
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
-        <div class="wy-side-nav-search">
+        <div class="wy-side-nav-search" >
          

          
@@ -226,19 +229,18 @@
 <h2>Algorithm Description<a class="headerlink" href="#algorithm-description" title="Permalink to this headline">¶</a></h2>
 <p>Rainbow combines 6 recent advancements in reinforcement learning:</p>
 <ul class="simple">
-<li>N-step returns</li>
-<li>Distributional state-action value learning</li>
-<li>Dueling networks</li>
-<li>Noisy Networks</li>
-<li>Double DQN</li>
-<li>Prioritized Experience Replay</li>
+<li><p>N-step returns</p></li>
+<li><p>Distributional state-action value learning</p></li>
+<li><p>Dueling networks</p></li>
+<li><p>Noisy Networks</p></li>
+<li><p>Double DQN</p></li>
+<li><p>Prioritized Experience Replay</p></li>
 </ul>
 <div class="section" id="training-the-network">
 <h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
 <ol class="arabic">
-<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
-</li>
-<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
+<li><p>Sample a batch of transitions from the replay buffer.</p></li>
+<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
 that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
 <p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
 <p>where:
@@ -246,36 +248,29 @@ that the <span class="math notranslate nohighlight">\(i-th\)</span> component of
 *  <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom
 <span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r_t+\gamma r_{t+1} + ... + \gamma^{n-1} r_{t+n-1} + \gamma^{n} z_j\)</span></p>
 </li>
-<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
-probability distribution.   Only the target of the actions that were actually taken is updated.</p>
-</li>
-<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
-</li>
-<li><p class="first">After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
-using the KL divergence loss that is returned from the network.</p>
-</li>
+<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
+probability distribution.   Only the target of the actions that were actually taken is updated.</p></li>
+<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
+<li><p>After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
+using the KL divergence loss that is returned from the network.</p></li>
 </ol>
 <dl class="class">
 <dt id="rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters">
 <em class="property">class </em><code class="descclassname">rl_coach.agents.rainbow_dqn_agent.</code><code class="descname">RainbowDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/rainbow_dqn_agent.html#RainbowDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
-<dd><table class="docutils field-list" frame="void" rules="none">
-<col class="field-name" />
-<col class="field-body" />
-<tbody valign="top">
-<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
-<li><strong>n_step</strong> – (int)
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>n_step</strong> – (int)
 The number of steps to bootstrap the network over. The first N-1 steps actual rewards will be accumulated
 using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
-prediction.</li>
-<li><strong>store_transitions_only_when_episodes_are_terminated</strong> – (bool)
+prediction.</p></li>
+<li><p><strong>store_transitions_only_when_episodes_are_terminated</strong> – (bool)
 If set to True, the transitions will be stored in an Episode object until the episode ends, and just then
 written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
-transitions into the memory, and to do so we need the entire episode first.</li>
+transitions into the memory, and to do so we need the entire episode first.</p></li>
 </ul>
-</td>
-</tr>
-</tbody>
-</table>
+</dd>
+</dl>
 </dd></dl>

 </div>
@@ -293,7 +288,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
        <a href="qr_dqn.html" class="btn btn-neutral float-right" title="Quantile Regression DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
-        <a href="../policy_optimization/ppo.html" class="btn btn-neutral" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="../policy_optimization/ppo.html" class="btn btn-neutral float-left" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
@@ -302,7 +297,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>

  <div role="contentinfo">
    <p>
-        &copy; Copyright 2018, Intel AI Lab
+        &copy; Copyright 2018-2019, Intel AI Lab

    </p>
  </div>
@@ -319,27 +314,16 @@ transitions into the memory, and to do so we need the entire episode first.</li>
  


-  
-
-    
-    
-      <script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
-        <script type="text/javascript" src="../../../_static/jquery.js"></script>
-        <script type="text/javascript" src="../../../_static/underscore.js"></script>
-        <script type="text/javascript" src="../../../_static/doctools.js"></script>
-        <script type="text/javascript" src="../../../_static/language_data.js"></script>
-        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
-    
-
-  
-
-  <script type="text/javascript" src="../../../_static/js/theme.js"></script>
-
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
-  </script> 
+  </script>
+
+  
+  
+    
+   

 </body>
 </html>