moving the docs to github
docs/algorithms/value_optimization/dqn/index.html | 348 (new file)
@@ -0,0 +1,348 @@
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">

<title>DQN - Reinforcement Learning Coach Documentation</title>

<link rel="shortcut icon" href="../../../img/favicon.ico">

<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>

<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
<link rel="stylesheet" href="../../../css/highlight.css">
<link href="../../../extra.css" rel="stylesheet">

<script>
// Current page data
var mkdocs_page_name = "DQN";
</script>

<script src="../../../js/jquery-2.1.1.min.js"></script>
<script src="../../../js/modernizr-2.8.3.min.js"></script>
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
<script src="../../../js/theme.js"></script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
</head>

<body class="wy-body-for-nav" role="document">
<div class="wy-grid-for-nav">

<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
<div class="wy-side-nav-search">
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
</form>
</div>
</div>

<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1 ">
<a class="" href="../../..">Home</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../../design/index.html">Design</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../../usage/index.html">Usage</a>
</li>
<ul class="subnav">
<li><span>Algorithms</span></li>
<li class="toctree-l1 current">
<a class="current" href="./index.html">DQN</a>
<ul>
<li class="toctree-l3"><a href="#deep-q-networks">Deep Q Networks</a></li>
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
</ul>
</li>
<li class="toctree-l1 ">
<a class="" href="../double_dqn/index.html">Double DQN</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../dueling_dqn/index.html">Dueling DQN</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../categorical_dqn/index.html">Categorical DQN</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../mmc/index.html">Mixed Monte Carlo</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../pal/index.html">Persistent Advantage Learning</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../nec/index.html">Neural Episodic Control</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../bs_dqn/index.html">Bootstrapped DQN</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../n_step/index.html">N-Step Q Learning</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../naf/index.html">Normalized Advantage Functions</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../policy_optimization/pg/index.html">Policy Gradient</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../policy_optimization/ac/index.html">Actor-Critic</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../policy_optimization/ddpg/index.html">Deep Determinstic Policy Gradients</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../policy_optimization/ppo/index.html">Proximal Policy Optimization</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../policy_optimization/cppo/index.html">Clipped Proximal Policy Optimization</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
</li>
</ul>
<li class="toctree-l1 ">
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
</li>
<ul class="subnav">
<li><span>Contributing</span></li>
<li class="toctree-l1 ">
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
</li>
<li class="toctree-l1 ">
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
</li>
</ul>
</ul>
</div>
</nav>

<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../..">Reinforcement Learning Coach Documentation</a>
</nav>

<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../..">Docs</a> »</li>
<li>Algorithms »</li>
<li>DQN</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main">
<div class="section">

<h1 id="deep-q-networks">Deep Q Networks</h1>
<p><strong>Action space:</strong> Discrete</p>
<p><strong>References:</strong> <a href="https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf">Playing Atari with Deep Reinforcement Learning</a></p>

<h2 id="network-structure">Network Structure</h2>
<p style="text-align: center;">
<img src="..\..\design_imgs\dqn.png">
</p>

<h2 id="algorithm-description">Algorithm Description</h2>
<h3 id="training-the-network">Training the network</h3>
<ol>
<li>Sample a batch of transitions from the replay buffer.</li>
<li>Using the next states from the sampled batch, run the target network to calculate the <script type="math/tex"> Q </script> values for each action, <script type="math/tex"> Q(s_{t+1},a) </script>, and keep only the maximum value for each state.</li>
<li>To zero out the updates for the actions that were not played (their MSE loss terms become zero), run the online network on the current states from the sampled batch to get the current <script type="math/tex"> Q </script> value predictions, and set those predictions as the targets for the actions that were not actually played.</li>
<li>
<p>For each action that was played, calculate the network target with the following equation: <script type="math/tex; mode=display"> y_t=r(s_t,a_t)+\gamma \cdot \max_a Q(s_{t+1},a) </script></p>
</li>
<li>
<p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p>
</li>
<li>Once every few thousand steps, copy the weights from the online network to the target network. A sketch of the full training step follows this list.</li>
</ol>
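<p>For illustration, here is a minimal NumPy sketch of one such training step. It is not Coach's actual implementation: <code>online_net</code>, <code>target_net</code> and <code>replay_buffer</code> are hypothetical stand-ins for the real components (with Keras-style <code>predict</code>/<code>train_on_batch</code> methods assumed), and the terminal-state masking is a standard detail the list above leaves implicit.</p>
<pre><code class="python">import numpy as np

def train_step(online_net, target_net, replay_buffer, batch_size=32, discount=0.99):
    # 1. Sample a batch of transitions from the replay buffer.
    states, actions, rewards, next_states, done = replay_buffer.sample(batch_size)

    # 2. Run the target network on the next states and keep only the
    #    maximum Q value for each state.
    max_next_q = target_net.predict(next_states).max(axis=1)

    # 3. Start from the online network's current predictions, so the
    #    MSE loss terms of the actions that were not played are zero.
    targets = online_net.predict(states)

    # 4. For each played action, set y_t = r_t + discount * max_a Q(s_t+1, a),
    #    dropping the bootstrap term for terminal transitions (done == 1).
    targets[np.arange(batch_size), actions] = rewards + discount * max_next_q * (1.0 - done)

    # 5. Train the online network on the current states with these targets.
    online_net.train_on_batch(states, targets)

# 6. Every few thousand steps (not shown): copy the online network's
#    weights to the target network.
</code></pre>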

</div>
</div>
<footer>

<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="../double_dqn/index.html" class="btn btn-neutral float-right" title="Double DQN"/>Next <span class="icon icon-circle-arrow-right"></span></a>
<a href="../../../usage/index.html" class="btn btn-neutral" title="Usage"><span class="icon icon-circle-arrow-left"></span> Previous</a>
</div>

<hr/>

<div role="contentinfo">
<!-- Copyright etc -->
</div>

Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>

<div class="rst-versions" role="note" style="cursor: pointer">
<span class="rst-current-version" data-toggle="rst-current-version">
<span><a href="../../../usage/index.html" style="color: #fcfcfc;">« Previous</a></span>
<span style="margin-left: 15px"><a href="../double_dqn/index.html" style="color: #fcfcfc">Next »</a></span>
</span>
</div>

</body>
</html>