Mirror of https://github.com/gryf/coach.git (synced 2025-12-17 19:20:19 +01:00)

Commit: moving the docs to github

docs/algorithms/value_optimization/bs_dqn/index.html (new file, 345 lines)
@@ -0,0 +1,345 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Bootstrapped DQN - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Bootstrapped DQN";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Bootstrapped DQN</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#bootstrapped-dqn">Bootstrapped DQN</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Bootstrapped DQN</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="bootstrapped-dqn">Bootstrapped DQN</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1602.04621">Deep Exploration via Bootstrapped DQN</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\bs_dqn.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="choosing-an-action">Choosing an action</h3>
|
||||
<p>The current states are used as the input to the network. The network contains several <script type="math/tex">Q</script> heads, which are used for returning different estimations of the action <script type="math/tex"> Q </script> values. For each episode, the bootstrapped exploration policy selects a single head to play with during the episode. According to the selected head, only the relevant output <script type="math/tex"> Q </script> values are used. Using those <script type="math/tex"> Q </script> values, the exploration policy then selects the action for acting.</p>
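<p>A minimal illustration of the per-episode head selection (names such as <code>num_heads</code> and <code>q_values_per_head</code> are assumptions, not Coach's exploration policy code):</p>
<pre><code class="python">import numpy as np

num_heads = 10  # assumed number of output heads

def start_episode():
    # Sample a single head to act with for the whole episode.
    return np.random.randint(num_heads)

def choose_action(q_values_per_head, active_head):
    # q_values_per_head: array of shape (num_heads, num_actions)
    # Only the selected head's Q values are used for acting (greedy here for simplicity).
    return int(np.argmax(q_values_per_head[active_head]))</code></pre>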
|
||||
<h3 id="storing-the-transitions">Storing the transitions</h3>
|
||||
<p>For each transition, a Binomial mask is generated according to a predefined probability, and the number of output heads. The mask is a binary vector where each element holds a 0 for heads that shouldn't train on the specific transition, and 1 for heads that should use the transition for training. The mask is stored as part of the transition info in the replay buffer. </p>
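<p>For illustration, the mask generation can be sketched as follows (<code>mask_probability</code> and the buffer call are assumed names, not Coach's API):</p>
<pre><code class="python">import numpy as np

num_heads = 10
mask_probability = 0.5  # assumed value of the predefined probability

def sample_mask():
    # One independent Bernoulli sample per head: 1 means the head trains on this transition.
    return np.random.binomial(1, mask_probability, size=num_heads)

# The mask is stored alongside the transition, e.g.:
# replay_buffer.store(state, action, reward, next_state, done, info={"mask": sample_mask()})</code></pre>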
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>First, sample a batch of transitions from the replay buffer. Run the current states through the network and get the current <script type="math/tex"> Q </script> value predictions for all the heads and all the actions. For each transition in the batch, and for each output head, if the transition mask is 1 - change the targets of the played action to <script type="math/tex">y_t</script>, according to the standard DQN update rule:</p>
|
||||
<p>
|
||||
<script type="math/tex; mode=display"> y_t=r(s_t,a_t )+\gamma\cdot max_a Q(s_{t+1},a) </script>
|
||||
</p>
|
||||
<p>Otherwise, leave it intact so that the transition does not affect the learning of this head. Then, train the online network according to the calculated targets.</p>
|
||||
<p>As in DQN, once in every few thousand steps, copy the weights from the online network to the target network.</p>
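<p>A minimal sketch of the masked target computation described above (illustrative only; the batch fields and array shapes are assumptions, not Coach's implementation):</p>
<pre><code class="python">import numpy as np

gamma = 0.99

def bootstrapped_dqn_targets(batch, online_q, target_q):
    # online_q, target_q: Q predictions of shape (batch_size, num_heads, num_actions)
    # batch.masks: (batch_size, num_heads); batch.actions, batch.rewards, batch.dones: (batch_size,)
    targets = online_q.copy()               # unplayed actions keep their current predictions
    max_next_q = target_q.max(axis=2)       # max_a Q(s_{t+1}, a) for each head
    for i in range(len(batch.rewards)):
        y = batch.rewards[i] + gamma * (1 - batch.dones[i]) * max_next_q[i]
        for head in range(targets.shape[1]):
            if batch.masks[i, head] == 1:   # this head trains on this transition
                targets[i, head, batch.actions[i]] = y[head]
    return targets</code></pre>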
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../n_step/index.html" class="btn btn-neutral float-right" title="N-Step Q Learning"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../nec/index.html" class="btn btn-neutral" title="Neural Episodic Control"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../nec/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../n_step/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
docs/algorithms/value_optimization/categorical_dqn/index.html (new file, 354 lines)
@@ -0,0 +1,354 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Categorical DQN - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Categorical DQN";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Categorical DQN</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#categorical-dqn">Categorical DQN</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Categorical DQN</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="categorical-dqn">Categorical DQN</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1707.06887">A Distributional Perspective on Reinforcement Learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\distributional_dqn.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<ol>
|
||||
<li>Sample a batch of transitions from the replay buffer. </li>
|
||||
<li>
|
||||
<p>The Bellman update is projected onto the set of atoms representing the <script type="math/tex"> Q </script> value distribution, such that the <script type="math/tex">i</script>-th component of the projected update is calculated as follows (a sketch of this projection is given after this list):
<script type="math/tex; mode=display"> (\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{|[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i|}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1})) </script>
where:</p>
|
||||
<ul>
|
||||
<li>
<script type="math/tex">[\cdot]^{b}_{a}</script> bounds its argument in the range <script type="math/tex">[a, b]</script></li>
|
||||
<li>
|
||||
<script type="math/tex">\hat{T}_{z_{j}}</script> is the Bellman update for atom <script type="math/tex">z_j</script>: <script type="math/tex">\hat{T}_{z_{j}} := r+\gamma z_j</script>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
<p>The network is trained with the cross-entropy loss between the resulting probability distribution and the target probability distribution. Only the targets of the actions that were actually taken are updated. </p>
</li>
<li>Once every few thousand steps, the weights are copied from the online network to the target network.</li>
|
||||
</ol>
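<p>The following is a minimal NumPy sketch of the categorical projection in step 2 (an illustration under assumed hyperparameter values, not Coach's implementation):</p>
<pre><code class="python">import numpy as np

# Assumed distribution support (agent hyperparameters).
v_min, v_max, num_atoms = -10.0, 10.0, 51
z = np.linspace(v_min, v_max, num_atoms)           # atom locations z_i
delta_z = (v_max - v_min) / (num_atoms - 1)
gamma = 0.99

def project_distribution(reward, done, next_probs):
    # next_probs: p_j(s_{t+1}, pi(s_{t+1})), shape (num_atoms,)
    projected = np.zeros(num_atoms)
    for j in range(num_atoms):
        # Bellman update for atom z_j, bounded to [v_min, v_max].
        tz_j = np.clip(reward + (1.0 - done) * gamma * z[j], v_min, v_max)
        b = (tz_j - v_min) / delta_z               # continuous atom index of tz_j
        lower, upper = int(np.floor(b)), int(np.ceil(b))
        if lower == upper:                         # tz_j fell exactly on an atom
            projected[lower] += next_probs[j]
        else:                                      # split the mass between the two neighbours
            projected[lower] += next_probs[j] * (upper - b)
            projected[upper] += next_probs[j] * (b - lower)
    return projected</code></pre>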
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../mmc/index.html" class="btn btn-neutral float-right" title="Mixed Monte Carlo"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../dueling_dqn/index.html" class="btn btn-neutral" title="Dueling DQN"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../dueling_dqn/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../mmc/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
docs/algorithms/value_optimization/double_dqn/index.html (new file, 349 lines)
@@ -0,0 +1,349 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Double DQN - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Double DQN";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Double DQN</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#double-dqn">Double DQN</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Double DQN</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="double-dqn">Double DQN</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1509.06461.pdf">Deep Reinforcement Learning with Double Q-learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\dqn.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<ol>
|
||||
<li>Sample a batch of transitions from the replay buffer. </li>
|
||||
<li>Using the next states from the sampled batch, run the online network to find the <script type="math/tex">Q</script> maximizing action <script type="math/tex">\mathrm{argmax}_a Q(s_{t+1},a)</script>. For these actions, run the target network on the corresponding next states to calculate <script type="math/tex">Q(s_{t+1},\mathrm{argmax}_a Q(s_{t+1},a))</script>.</li>
|
||||
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), use the current states from the sampled batch, and run the online network to get the current Q value predictions. Set those values as the targets for the actions that were not actually played. </li>
|
||||
<li>
<p>For each action that was played, use the following equation to calculate the network targets (see the sketch after this list):
<script type="math/tex; mode=display"> y_t=r(s_t,a_t)+\gamma \cdot Q(s_{t+1},\mathrm{argmax}_a Q(s_{t+1},a)) </script>
</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Finally, train the online network using the current states as inputs, and with the aforementioned targets. </p>
|
||||
</li>
|
||||
<li>Once every few thousand steps, copy the weights from the online network to the target network.</li>
|
||||
</ol>
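<p>A minimal sketch of the Double DQN target computation from steps 2-4 (illustrative only; the batch fields and array shapes are assumptions, not Coach's implementation):</p>
<pre><code class="python">import numpy as np

gamma = 0.99

def double_dqn_targets(batch, online_q_current, online_q_next, target_q_next):
    # online_q_current: online network Q values for the current states, (batch_size, num_actions)
    # online_q_next, target_q_next: Q values for the next states, (batch_size, num_actions)
    targets = online_q_current.copy()             # unplayed actions keep their current predictions
    best_actions = online_q_next.argmax(axis=1)   # action selection by the online network
    for i, a in enumerate(batch.actions):
        bootstrap = target_q_next[i, best_actions[i]]   # action evaluation by the target network
        targets[i, a] = batch.rewards[i] + gamma * (1 - batch.dones[i]) * bootstrap
    return targets</code></pre>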
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../dueling_dqn/index.html" class="btn btn-neutral float-right" title="Dueling DQN"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../dqn/index.html" class="btn btn-neutral" title="DQN"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../dqn/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../dueling_dqn/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
docs/algorithms/value_optimization/dqn/index.html (new file, 348 lines)
@@ -0,0 +1,348 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>DQN - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "DQN";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">DQN</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#deep-q-networks">Deep Q Networks</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>DQN</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="deep-q-networks">Deep Q Networks</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf">Playing Atari with Deep Reinforcement Learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\dqn.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<ol>
|
||||
<li>Sample a batch of transitions from the replay buffer. </li>
|
||||
<li>Using the next states from the sampled batch, run the target network to calculate the <script type="math/tex"> Q </script> values for each of the actions <script type="math/tex"> Q(s_{t+1},a) </script>, and keep only the maximum value for each state. </li>
|
||||
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), use the current states from the sampled batch, and run the online network to get the current Q value predictions. Set those values as the targets for the actions that were not actually played. </li>
|
||||
<li>
<p>For each action that was played, use the following equation to calculate the network targets (see the sketch after this list): <script type="math/tex; mode=display"> y_t=r(s_t,a_t)+\gamma \cdot \max_a Q(s_{t+1},a) </script>
</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Finally, train the online network using the current states as inputs, and with the aforementioned targets. </p>
|
||||
</li>
|
||||
<li>Once every few thousand steps, copy the weights from the online network to the target network.</li>
|
||||
</ol>
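<p>A minimal sketch of the DQN target computation from steps 2-4 (illustrative only; the batch fields and array shapes are assumptions, not Coach's implementation):</p>
<pre><code class="python">import numpy as np

gamma = 0.99

def dqn_targets(batch, online_q_current, target_q_next):
    # online_q_current: online network Q values for the current states, (batch_size, num_actions)
    # target_q_next: target network Q values for the next states, (batch_size, num_actions)
    targets = online_q_current.copy()        # unplayed actions keep their current predictions
    max_next_q = target_q_next.max(axis=1)   # max_a Q(s_{t+1}, a)
    for i, a in enumerate(batch.actions):
        targets[i, a] = batch.rewards[i] + gamma * (1 - batch.dones[i]) * max_next_q[i]
    return targets

# The online network is then trained with an MSE loss between its predictions and these targets.</code></pre>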
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../double_dqn/index.html" class="btn btn-neutral float-right" title="Double DQN"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../../../usage/index.html" class="btn btn-neutral" title="Usage"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../../../usage/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../double_dqn/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
docs/algorithms/value_optimization/dueling_dqn/index.html (new file, 338 lines)
@@ -0,0 +1,338 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Dueling DQN - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Dueling DQN";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Dueling DQN</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#dueling-dqn">Dueling DQN</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#general-description">General Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Dueling DQN</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="dueling-dqn">Dueling DQN</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1511.06581">Dueling Network Architectures for Deep Reinforcement Learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\dueling_dqn.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="general-description">General Description</h2>
|
||||
<p>Dueling DQN changes the network structure compared to DQN.</p>
<p>Dueling DQN uses a specialized <em>Dueling Q Head</em> in order to separate <script type="math/tex"> Q </script> into an advantage stream <script type="math/tex"> A </script> and a value stream <script type="math/tex"> V </script>. Adding this type of structure to the network head allows the network to better differentiate actions from one another, and significantly improves learning. A sketch of how the two streams are combined is given below.</p>
<p>In many states, the values of the different actions are very similar, and it is less important which action to take.
This is especially important in environments with many actions to choose from. In DQN, on each training iteration, for each of the states in the batch, we update the <script type="math/tex">Q</script> values only for the specific actions taken in those states. This results in slower learning, as we do not learn the <script type="math/tex">Q</script> values for actions that were not taken yet. With the dueling architecture, on the other hand, learning is faster, since we start learning the state value even if only a single action has been taken in that state.</p>
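<p>For illustration, the dueling head combines the two streams as follows (a minimal sketch of the aggregation used in the referenced paper, not Coach's code):</p>
<pre><code class="python">import numpy as np

def dueling_q_values(value, advantages):
    # value: V(s), shape (batch_size, 1)
    # advantages: A(s, a), shape (batch_size, num_actions)
    # Subtracting the mean advantage keeps the V/A decomposition identifiable:
    # Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
    return value + advantages - advantages.mean(axis=1, keepdims=True)</code></pre>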
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../categorical_dqn/index.html" class="btn btn-neutral float-right" title="Categorical DQN"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../double_dqn/index.html" class="btn btn-neutral" title="Double DQN"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../double_dqn/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../categorical_dqn/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
docs/algorithms/value_optimization/mmc/index.html (new file, 350 lines)
@@ -0,0 +1,350 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Mixed Monte Carlo - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Mixed Monte Carlo";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Mixed Monte Carlo</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#mixed-monte-carlo">Mixed Monte Carlo</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Mixed Monte Carlo</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="mixed-monte-carlo">Mixed Monte Carlo</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1703.01310">Count-Based Exploration with Neural Density Models</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="../../design_imgs/dqn.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>In MMC, targets are calculated as a mixture between Double DQN targets and full Monte Carlo samples (total discounted returns).</p>
|
||||
<p>The DDQN targets are calculated in the same manner as in the DDQN agent:</p>
|
||||
<p>
|
||||
<script type="math/tex; mode=display"> y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a)) </script>
|
||||
</p>
|
||||
<p>The Monte Carlo targets are calculated by summing up the discounted rewards across the entire episode:</p>
|
||||
<p>
|
||||
<script type="math/tex; mode=display"> y_t^{MC}=\sum_{j=0}^T\gamma^j r(s_{t+j},a_{t+j} ) </script>
|
||||
</p>
|
||||
<p>A mixing ratio <script type="math/tex">\alpha</script> is then used to get the final targets:</p>
|
||||
<p>
|
||||
<script type="math/tex; mode=display"> y_t=(1-\alpha)\cdot y_t^{DDQN}+\alpha \cdot y_t^{MC} </script>
|
||||
</p>
|
||||
<p>Finally, the online network is trained using the current states as inputs, and the calculated targets.
|
||||
Once in every few thousand steps, copy the weights from the online network to the target network.</p>
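<p>Below is a minimal sketch of how the mixed targets could be computed for one finished episode, assuming the per-step rewards and the precomputed DDQN targets are available as NumPy arrays (the function and argument names are illustrative, not Coach's API):</p>
<pre><code class="python">import numpy as np

def mixed_monte_carlo_targets(rewards, ddqn_targets, gamma=0.99, alpha=0.1):
    """Blend DDQN targets with full Monte Carlo returns for one episode.

    rewards      -- array of rewards r_t for the entire episode (length T)
    ddqn_targets -- array of precomputed DDQN targets y_t^DDQN (length T)
    alpha        -- mixing ratio between the two target types (assumed value)
    """
    T = len(rewards)
    mc_returns = np.zeros(T)
    running_return = 0.0
    # Walk the episode backwards to accumulate the total discounted return of each step.
    for t in reversed(range(T)):
        running_return = rewards[t] + gamma * running_return
        mc_returns[t] = running_return
    # y_t = (1 - alpha) * y_t^DDQN + alpha * y_t^MC
    return (1.0 - alpha) * np.asarray(ddqn_targets) + alpha * mc_returns
</code></pre>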
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../pal/index.html" class="btn btn-neutral float-right" title="Persistent Advantage Learning"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../categorical_dqn/index.html" class="btn btn-neutral" title="Categorical DQN"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../categorical_dqn/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../pal/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
349
docs/algorithms/value_optimization/n_step/index.html
Normal file
349
docs/algorithms/value_optimization/n_step/index.html
Normal file
@@ -0,0 +1,349 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>N-Step Q Learning - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "N-Step Q Learning";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">N-Step Q Learning</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#n-step-q-learning">N-Step Q Learning</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>N-Step Q Learning</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="n-step-q-learning">N-Step Q Learning</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1602.01783">Asynchronous Methods for Deep Reinforcement Learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\dqn.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>The <script type="math/tex">N</script>-step Q learning algorithm works in similar manner to DQN except for the following changes:</p>
|
||||
<ol>
|
||||
<li>
|
||||
<p>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every <script type="math/tex">N</script> steps using the latest <script type="math/tex">N</script> steps played by the agent.</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>In order to stabilize the learning, multiple workers work together to update the network. This creates the same effect as uncorrelating the samples used for training.</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Instead of using single-step Q targets for the network, the rewards from <script type="math/tex">N</script> consequent steps are accumulated to form the <script type="math/tex">N</script>-step Q targets, according to the following equation:
|
||||
<script type="math/tex; mode=display">R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})</script>
|
||||
where <script type="math/tex">k</script> is <script type="math/tex">T_{max} - State\_Index</script> for each state in the batch</p>
|
||||
</li>
|
||||
</ol>
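<p>Below is a minimal sketch of the target computation from step 3, assuming the rewards of the latest <script type="math/tex">N</script> steps and the bootstrap value <script type="math/tex">V(s_{t+N})</script> are already available (illustrative names, not Coach's API):</p>
<pre><code class="python">import numpy as np

def n_step_q_targets(rewards, bootstrap_value, gamma=0.99):
    """Compute the N-step targets for the latest segment of N transitions.

    rewards         -- rewards of the last N steps, ordered in time
    bootstrap_value -- V(s_{t+N}) estimated for the state following the segment
    """
    targets = np.zeros(len(rewards))
    running = bootstrap_value
    # Every state bootstraps from the value at the end of the segment,
    # so k shrinks as we move towards the last state in the batch.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        targets[t] = running
    return targets
</code></pre>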
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../naf/index.html" class="btn btn-neutral float-right" title="Normalized Advantage Functions"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../bs_dqn/index.html" class="btn btn-neutral" title="Bootstrapped DQN"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../bs_dqn/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../naf/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
341
docs/algorithms/value_optimization/naf/index.html
Normal file
341
docs/algorithms/value_optimization/naf/index.html
Normal file
@@ -0,0 +1,341 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Normalized Advantage Functions - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Normalized Advantage Functions";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Normalized Advantage Functions</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#normalized-advantage-functions">Normalized Advantage Functions</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Normalized Advantage Functions</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="normalized-advantage-functions">Normalized Advantage Functions</h1>
|
||||
<p><strong>Actions space:</strong> Continuous</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1603.00748.pdf">Continuous Deep Q-Learning with Model-based Acceleration</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\naf.png" width=600>
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="choosing-an-action">Choosing an action</h3>
|
||||
<p>The current state is used as an input to the network. The action mean <script type="math/tex"> \mu(s_t ) </script> is extracted from the output head. It is then passed to the exploration policy which adds noise in order to encourage exploration.</p>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>The network is trained by using the following targets:
|
||||
<script type="math/tex; mode=display"> y_t=r(s_t,a_t )+\gamma\cdot V(s_{t+1}) </script>
|
||||
Use the next states as the inputs to the target network and extract the <script type="math/tex"> V </script> value, from within the head, to get <script type="math/tex"> V(s_{t+1} ) </script>. Then, update the online network using the current states and actions as inputs, and <script type="math/tex"> y_t </script> as the targets.
|
||||
After every training step, use a soft update in order to copy the weights from the online network to the target network.</p>
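<p>Below is a minimal sketch of the soft target update mentioned above, assuming both networks expose their weights as lists of NumPy arrays of matching shapes (illustrative names, not Coach's API):</p>
<pre><code class="python">def soft_update(online_weights, target_weights, tau=0.001):
    """Move the target network a small step towards the online network.

    online_weights, target_weights -- lists of NumPy arrays of matching shapes
    tau -- interpolation coefficient; small values give a slowly moving target
    """
    return [tau * w_online + (1.0 - tau) * w_target
            for w_online, w_target in zip(online_weights, target_weights)]
</code></pre>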
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../../policy_optimization/pg/index.html" class="btn btn-neutral float-right" title="Policy Gradient"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../n_step/index.html" class="btn btn-neutral" title="N-Step Q Learning"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../n_step/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../../policy_optimization/pg/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
348
docs/algorithms/value_optimization/nec/index.html
Normal file
348
docs/algorithms/value_optimization/nec/index.html
Normal file
@@ -0,0 +1,348 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Neural Episodic Control - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Neural Episodic Control";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Neural Episodic Control</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#neural-episodic-control">Neural Episodic Control</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Neural Episodic Control</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="neural-episodic-control">Neural Episodic Control</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1703.01988">Neural Episodic Control</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\nec.png" width=500>
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="choosing-an-action">Choosing an action</h3>
|
||||
<ol>
|
||||
<li>Use the current state as an input to the online network and extract the state embedding, which is the intermediate output from the middleware. </li>
|
||||
<li>For each possible action <script type="math/tex">a_i</script>, run the DND head using the state embedding and the selected action <script type="math/tex">a_i</script> as inputs. The DND is queried and returns the <script type="math/tex"> P </script> nearest neighbor keys and values. The keys and values are used to calculate and return the action <script type="math/tex"> Q </script> value from the network. </li>
|
||||
<li>Pass all the <script type="math/tex"> Q </script> values to the exploration policy and choose an action accordingly. </li>
|
||||
<li>Store the state embeddings and actions taken during the current episode in a small buffer <script type="math/tex">B</script>, in order to accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</li>
|
||||
</ol>
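<p>Below is a minimal sketch of the DND lookup from step 2 for a single action, assuming the <script type="math/tex"> P </script> nearest neighbor keys and values have already been retrieved and using an inverse-distance kernel as in the NEC paper (illustrative names, not Coach's API):</p>
<pre><code class="python">import numpy as np

def dnd_q_value(embedding, neighbor_keys, neighbor_values, delta=1e-3):
    """Kernel-weighted average of the stored values of the P nearest neighbors.

    embedding       -- state embedding h produced by the middleware, shape (d,)
    neighbor_keys   -- the P stored embeddings closest to h, shape (P, d)
    neighbor_values -- the P stored Q value estimates, shape (P,)
    """
    # Inverse squared-distance kernel: k(h, h_i) = 1 / (||h - h_i||^2 + delta)
    sq_dist = np.sum((neighbor_keys - embedding) ** 2, axis=1)
    kernel = 1.0 / (sq_dist + delta)
    weights = kernel / kernel.sum()
    # Q(s, a) is the normalized kernel-weighted sum of the stored values.
    return float(np.dot(weights, neighbor_values))
</code></pre>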
|
||||
<h3 id="finalizing-an-episode">Finalizing an episode</h3>
|
||||
<p>For each step in the episode, the state embeddings and the taken actions are stored in the buffer <script type="math/tex">B</script>. When the episode is finished, the replay buffer calculates the <script type="math/tex"> N </script>-step total return of each transition in the buffer, bootstrapped using the maximum <script type="math/tex">Q</script> value of the <script type="math/tex">N</script>-th transition. Those values are inserted along with the total return into the DND, and the buffer <script type="math/tex">B</script> is reset.</p>
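<p>Below is a minimal sketch of this finalization step, assuming the buffer holds (embedding, action, reward) tuples and each action has its own DND stored as a plain list (illustrative names, not Coach's API):</p>
<pre><code class="python">def finalize_episode(episode_buffer, max_q_values, dnd_per_action, n=50, gamma=0.99):
    """Compute N-step bootstrapped returns and insert them into the per-action DND.

    episode_buffer -- list of (embedding, action, reward) tuples, in time order
    max_q_values   -- max_a Q(s_i, a) estimated for every state in the episode
    dnd_per_action -- dict mapping action -> list of (embedding, return) entries
    n, gamma       -- bootstrap horizon and discount factor (assumed values)
    """
    T = len(episode_buffer)
    for t, (embedding, action, _) in enumerate(episode_buffer):
        horizon = min(n, T - t)
        # Sum of discounted rewards over the next `horizon` steps...
        ret = sum(gamma ** j * episode_buffer[t + j][2] for j in range(horizon))
        # ...bootstrapped with the max Q value of the N-th state, when it exists.
        if T > t + n:
            ret += gamma ** n * max_q_values[t + n]
        dnd_per_action[action].append((embedding, ret))
    episode_buffer.clear()
</code></pre>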
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>Train the network only when the DND has enough entries for querying.</p>
|
||||
<p>To train the network, the current states are used as the inputs and the <script type="math/tex">N</script>-step returns are used as the targets. The <script type="math/tex">N</script>-step return used takes into account <script type="math/tex"> N </script> consecutive steps, and bootstraps the last value from the network if necessary:
|
||||
<script type="math/tex; mode=display"> y_t=\sum_{j=0}^{N-1}\gamma^j r(s_{t+j},a_{t+j} ) +\gamma^N max_a Q(s_{t+N},a) </script>
|
||||
</p>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../bs_dqn/index.html" class="btn btn-neutral float-right" title="Bootstrapped DQN"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../pal/index.html" class="btn btn-neutral" title="Persistent Advantage Learning"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../pal/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../bs_dqn/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
362
docs/algorithms/value_optimization/pal/index.html
Normal file
362
docs/algorithms/value_optimization/pal/index.html
Normal file
@@ -0,0 +1,362 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Persistent Advantage Learning - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Persistent Advantage Learning";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Persistent Advantage Learning</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#persistent-advantage-learning">Persistent Advantage Learning</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Persistent Advantage Learning</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="persistent-advantage-learning">Persistent Advantage Learning</h1>
|
||||
<p><strong>Actions space:</strong> Discrete</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1512.04860">Increasing the Action Gap: New Operators for Reinforcement Learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="../../design_imgs/dqn.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<ol>
|
||||
<li>
|
||||
<p>Sample a batch of transitions from the replay buffer. </p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Start by calculating the initial target values in the same manner as they are calculated in DDQN
|
||||
<script type="math/tex; mode=display"> y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a)) </script>
|
||||
</p>
|
||||
</li>
|
||||
<li>The action gap <script type="math/tex"> V(s_t )-Q(s_t,a_t) </script> should then be subtracted from each of the calculated targets. To calculate the action gap, run the target network using the current states and get the <script type="math/tex"> Q </script> values for all the actions. Then estimate <script type="math/tex"> V </script> as the maximum predicted <script type="math/tex"> Q </script> value for the current state:
|
||||
<script type="math/tex; mode=display"> V(s_t )=max_a Q(s_t,a) </script>
|
||||
</li>
|
||||
<li>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <script type="math/tex"> \alpha </script> from the targets <script type="math/tex"> y_t^{DDQN} </script>:
|
||||
<script type="math/tex; mode=display"> y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t )) </script>
|
||||
</li>
|
||||
<li>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action gap for the next state:
|
||||
<script type="math/tex; mode=display"> V(s_{t+1} )-Q(s_{t+1},a_{t+1}) </script>
|
||||
where <script type="math/tex"> a_{t+1} </script> is chosen by running the next states through the online network and choosing the action that has the highest predicted <script type="math/tex"> Q </script> value. Finally, the targets will be defined as -
|
||||
<script type="math/tex; mode=display"> y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} )) </script>
|
||||
</li>
|
||||
<li>
|
||||
<p>Train the online network using the current states as inputs, and with the aforementioned targets.</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Once in every few thousand steps, copy the weights from the online network to the target network.</p>
|
||||
</li>
|
||||
</ol>
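<p>Below is a minimal sketch of steps 3-5 with NumPy, assuming the DDQN targets and the target network's <script type="math/tex"> Q </script> values for the current and next states have already been computed (illustrative names and default values, not Coach's API):</p>
<pre><code class="python">import numpy as np

def pal_targets(ddqn_targets, q_current, q_next, actions, next_actions, alpha=0.9):
    """Persistent advantage learning targets for a batch of transitions.

    ddqn_targets -- y_t^DDQN for each transition, shape (batch,)
    q_current    -- target-network Q values for the current states, shape (batch, num_actions)
    q_next       -- target-network Q values for the next states, shape (batch, num_actions)
    actions      -- integer actions actually taken, shape (batch,)
    next_actions -- integer actions chosen by the online network for the next states, shape (batch,)
    alpha        -- action-gap scaling coefficient (assumed value)
    """
    batch = np.arange(len(actions))
    # Action gap for the current state: V(s_t) - Q(s_t, a_t), with V = max_a Q.
    gap_current = q_current.max(axis=1) - q_current[batch, actions]
    # Action gap for the next state: V(s_{t+1}) - Q(s_{t+1}, a_{t+1}).
    gap_next = q_next.max(axis=1) - q_next[batch, next_actions]
    # PAL keeps the smaller of the two gaps; plain AL would use gap_current alone.
    return ddqn_targets - alpha * np.minimum(gap_current, gap_next)
</code></pre>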
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../nec/index.html" class="btn btn-neutral float-right" title="Neural Episodic Control"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../mmc/index.html" class="btn btn-neutral" title="Mixed Monte Carlo"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../mmc/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../nec/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>