mirror of
https://github.com/gryf/coach.git
synced 2025-12-17 19:20:19 +01:00
moving the docs to github
343
docs/algorithms/policy_optimization/ac/index.html
Normal file
@@ -0,0 +1,343 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Actor-Critic - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Actor-Critic";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Actor-Critic</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#actor-critic">Actor-Critic</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Actor-Critic</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="actor-critic">Actor-Critic</h1>
|
||||
<p><strong>Action space:</strong> Discrete|Continuous</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1602.01783">Asynchronous Methods for Deep Reinforcement Learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
<img src="../../design_imgs/ac.png" width="500">
</p>
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="choosing-an-action-discrete-actions">Choosing an action - Discrete actions</h3>
|
||||
<p>The policy network is used to predict action probabilities. During training, an action is sampled from the categorical distribution defined by these probabilities. When testing, the action with the highest probability is used.</p>
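<p>As an illustration, the following is a minimal NumPy sketch of this action-selection rule; the function name and signature are illustrative, not Coach's API:</p>
<pre><code class="python">import numpy as np

def choose_action(action_probs, is_training):
    """Sample from the categorical distribution while training; act greedily when testing."""
    if is_training:
        return int(np.random.choice(len(action_probs), p=action_probs))
    return int(np.argmax(action_probs))
</code></pre>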
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>A batch of <script type="math/tex"> T_{max} </script> transitions is used, and the advantages are calculated over it.</p>
|
||||
<p>Advantages can be calculated by either of the following methods (configured by the selected preset):</p>
|
||||
<ol>
|
||||
<li><strong>A_VALUE</strong> - Estimating advantage directly:<script type="math/tex; mode=display"> A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t) </script>where <script type="math/tex">k</script> is <script type="math/tex">T_{max} - State\_Index</script> for each state in the batch.</li>
|
||||
<li><strong>GAE</strong> - By following the <a href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper. </li>
|
||||
</ol>
|
||||
<p>The advantages are then used to accumulate gradients according to
|
||||
<script type="math/tex; mode=display"> L = -\mathop{\mathbb{E}} [log (\pi) \cdot A] </script>
|
||||
</p>
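<p>The following is a minimal NumPy sketch of the <strong>A_VALUE</strong> advantage computation and the resulting policy loss, assuming a bootstrapped value estimate for the last state; the names and signatures are illustrative, not Coach's API:</p>
<pre><code class="python">import numpy as np

def a_value_advantages(rewards, values, bootstrap_value, gamma=0.99):
    """A_VALUE advantages over a batch of T_max transitions: for each state t,
    k = T_max - t steps of discounted rewards plus a bootstrapped V(s_{t+k})."""
    t_max = len(rewards)
    returns = np.zeros(t_max)
    running = bootstrap_value                   # V(s_{t+k}) of the state after the batch
    for t in reversed(range(t_max)):
        running = rewards[t] + gamma * running  # k-step discounted return
        returns[t] = running
    return returns - np.asarray(values)         # Q(s_t, a_t) - V(s_t)

def policy_loss(log_probs, advantages):
    """L = -E[log(pi) * A]; the advantages are treated as constants."""
    return -np.mean(log_probs * advantages)
</code></pre>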
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../ddpg/index.html" class="btn btn-neutral float-right" title="Deep Determinstic Policy Gradients"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../pg/index.html" class="btn btn-neutral" title="Policy Gradient"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../pg/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../ddpg/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
353
docs/algorithms/policy_optimization/cppo/index.html
Normal file
@@ -0,0 +1,353 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Clipped Proximal Policy Optimization - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Clipped Proximal Policy Optimization";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#clipped-proximal-policy-optimization">Clipped Proximal Policy Optimization</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Clipped Proximal Policy Optimization</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="clipped-proximal-policy-optimization">Clipped Proximal Policy Optimization</h1>
|
||||
<p><strong>Action space:</strong> Discrete|Continuous</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/pdf/1707.06347.pdf">Proximal Policy Optimization Algorithms</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
<img src="..\..\design_imgs\ppo.png">
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="choosing-an-action-continuous-action">Choosing an action - Continuous action</h3>
|
||||
<p>Same as in PPO. </p>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>Training is very similar to PPO, with several small (but greatly simplifying) changes:</p>
|
||||
<ol>
|
||||
<li>
|
||||
<p>Train both the value and policy networks simultaneously, by defining a single loss function which is the sum of the two networks' loss functions. Then, backpropagate gradients only once from this unified loss function.</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>The unified network's optimizer is set to Adam (instead of L-BFGS for the value network as in PPO). </p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Value targets are now also calculated based on the GAE advantages. In this method, the <script type="math/tex"> V </script> values predicted by the critic network are added to the GAE-based advantages, giving a <script type="math/tex"> Q </script> value for each action. Since the critic network predicts a <script type="math/tex"> V </script> value for each state, setting these calculated <script type="math/tex"> Q </script> action-values as targets serves, on average, as a <script type="math/tex"> V </script> state-value target. </p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio <script type="math/tex">r_t(\theta) =\frac{\pi_{\theta}(a|s)}{\pi_{\theta_{old}}(a|s)}</script> is clipped to achieve a similar effect. This is done by defining the policy's loss function as the minimum between the standard surrogate loss and an epsilon-clipped surrogate loss:</p>
|
||||
</li>
|
||||
</ol>
|
||||
<p>
|
||||
<script type="math/tex; mode=display">L^{CLIP}(\theta)=E_{t}[min(r_t(\theta)\cdot \hat{A}_t, clip(r_t(\theta), 1-\epsilon, 1+\epsilon) \cdot \hat{A}_t)] </script>
|
||||
</p>
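<p>A minimal NumPy sketch of this clipped surrogate loss, assuming per-transition log-probabilities under the new and old policies are available; the names and the epsilon value are illustrative, not Coach's API:</p>
<pre><code class="python">import numpy as np

def clipped_surrogate_loss(log_probs, old_log_probs, advantages, epsilon=0.2):
    """Clipped PPO policy loss (to be minimized), elementwise over a batch."""
    ratio = np.exp(log_probs - old_log_probs)                          # r_t(theta)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    # The surrogate objective takes the elementwise minimum; negate it to get a loss.
    return -np.mean(np.minimum(unclipped, clipped))
</code></pre>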
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../../other/dfp/index.html" class="btn btn-neutral float-right" title="Direct Future Prediction"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../ppo/index.html" class="btn btn-neutral" title="Proximal Policy Optimization"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../ppo/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../../other/dfp/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
351
docs/algorithms/policy_optimization/ddpg/index.html
Normal file
@@ -0,0 +1,351 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Deep Deterministic Policy Gradients - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Deep Determinstic Policy Gradients";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#deep-deterministic-policy-gradient">Deep Deterministic Policy Gradient</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Deep Deterministic Policy Gradients</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="deep-deterministic-policy-gradient">Deep Deterministic Policy Gradient</h1>
|
||||
<p><strong>Action space:</strong> Continuous</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1509.02971">Continuous control with deep reinforcement learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\ddpg.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="choosing-an-action">Choosing an action</h3>
|
||||
<p>Pass the current states through the actor network, and get an action mean vector <script type="math/tex"> \mu </script>. During the training phase, use a continuous exploration policy, such as the Ornstein-Uhlenbeck process, to add exploration noise to the action. When testing, use the mean vector <script type="math/tex">\mu</script> as-is.</p>
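<p>A minimal sketch of an Ornstein-Uhlenbeck noise process that could be used for such exploration is shown below; the coefficients are illustrative values, not Coach's defaults:</p>
<pre><code class="python">import numpy as np

class OrnsteinUhlenbeckNoise:
    """Temporally correlated exploration noise, a minimal sketch."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.state = np.full(action_dim, mu)

    def sample(self):
        drift = self.theta * (self.mu - self.state) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.randn(*self.state.shape)
        self.state = self.state + drift + diffusion
        return self.state
</code></pre>
<p>During training, the exploration action would then be, for example, <code>actor(state) + noise.sample()</code>, while testing uses the actor output directly.</p>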
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>Start by sampling a batch of transitions from the experience replay.</p>
|
||||
<ul>
|
||||
<li>To train the <strong>critic network</strong>, use the following targets:</li>
|
||||
</ul>
|
||||
<p>
|
||||
<script type="math/tex; mode=display"> y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} )) </script>
|
||||
First run the actor target network, using the next states as the inputs, and get <script type="math/tex"> \mu (s_{t+1} ) </script>. Next, run the critic target network using the next states and <script type="math/tex"> \mu (s_{t+1} ) </script>, and use the output to calculate <script type="math/tex"> y_t </script> according to the equation above. To train the network, use the current states and actions as the inputs, and <script type="math/tex">y_t</script> as the targets.</p>
|
||||
<ul>
|
||||
<li>To train the <strong>actor network</strong>, use the following equation:</li>
|
||||
</ul>
|
||||
<p>
|
||||
<script type="math/tex; mode=display"> \nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ] </script>
|
||||
Use the actor's online network to get the action mean values using the current states as the inputs. Then, use the critic online network in order to get the gradients of the critic output with respect to the action mean values <script type="math/tex"> \nabla _a Q(s,a)|_{s=s_t,a=\mu(s_t ) } </script>. Using the chain rule, calculate the gradients of the actor's output, with respect to the actor weights, given <script type="math/tex"> \nabla_a Q(s,a) </script>. Finally, apply those gradients to the actor network.</p>
|
||||
<p>After every training step, do a soft update of the critic and actor target networks' weights from the online networks.</p>
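<p>The following NumPy sketch summarizes the critic targets and the soft (Polyak) update described above; <code>actor_target</code> and <code>critic_target</code> are assumed to be callables returning NumPy arrays, and all names are illustrative, not Coach's API:</p>
<pre><code class="python">import numpy as np

def critic_targets(rewards, next_states, actor_target, critic_target, gamma=0.99):
    """y_t = r(s_t, a_t) + gamma * Q_target(s_{t+1}, mu_target(s_{t+1}))."""
    next_actions = actor_target(next_states)
    return rewards + gamma * critic_target(next_states, next_actions)

def soft_update(target_weights, online_weights, tau=0.001):
    """Polyak-averaged update of the target network weights (tau is illustrative)."""
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_weights, target_weights)]
</code></pre>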
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../ppo/index.html" class="btn btn-neutral float-right" title="Proximal Policy Optimization"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../ac/index.html" class="btn btn-neutral" title="Actor-Critic"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../ac/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../ppo/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
343
docs/algorithms/policy_optimization/pg/index.html
Normal file
@@ -0,0 +1,343 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Policy Gradient - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Policy Gradient";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Policy Gradient</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#policy-gradient">Policy Gradient</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ppo/index.html">Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Policy Gradient</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="policy-gradient">Policy Gradient</h1>
|
||||
<p><strong>Action space:</strong> Discrete|Continuous</p>
|
||||
<p><strong>References:</strong> <a href="http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf">Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\pg.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="choosing-an-action-discrete-actions">Choosing an action - Discrete actions</h3>
|
||||
<p>Run the current states through the network and get a policy distribution over the actions. While training, sample from the policy distribution. When testing, take the action with the highest probability. </p>
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<p>The policy head loss is defined as <script type="math/tex"> L=-log (\pi) \cdot PolicyGradientRescaler </script>. The <script type="math/tex">PolicyGradientRescaler</script> is used to reduce the variance of the policy gradient updates, since noisy gradient updates can destabilize the policy's convergence. The rescaler is a configurable parameter, and there are a few options to choose from:</p>
<ul>
<li><strong>Total Episode Return</strong> - the sum of all the discounted rewards during the episode.</li>
<li><strong>Future Return</strong> - the return from each transition until the end of the episode.</li>
<li><strong>Future Return Normalized by Episode</strong> - the future returns across the episode, normalized by the episode's mean and standard deviation.</li>
<li><strong>Future Return Normalized by Timestep</strong> - the future returns normalized using running means and standard deviations, which are calculated separately for each timestep, across different episodes.</li>
</ul>
|
||||
<p>Gradients are accumulated over a number of fully played episodes; accumulating over several episodes serves the same purpose of reducing the update variance. The accumulated gradients are then applied to the network. </p>
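<p>A minimal NumPy sketch of the <strong>Future Return</strong> rescaler and its per-episode normalization; the function names are illustrative, not Coach's API:</p>
<pre><code class="python">import numpy as np

def discounted_future_returns(rewards, gamma=0.99):
    """'Future Return' rescaler: discounted return from each transition to episode end."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def normalize_by_episode(returns):
    """'Future Return Normalized by Episode' rescaler."""
    return (returns - returns.mean()) / (returns.std() + 1e-8)
</code></pre>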
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../ac/index.html" class="btn btn-neutral float-right" title="Actor-Critic"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../../value_optimization/naf/index.html" class="btn btn-neutral" title="Normalized Advantage Functions"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../../value_optimization/naf/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../ac/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
344
docs/algorithms/policy_optimization/ppo/index.html
Normal file
@@ -0,0 +1,344 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Proximal Policy Optimization - Reinforcement Learning Coach Documentation</title>
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="../../../img/favicon.ico">
|
||||
|
||||
|
||||
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
|
||||
|
||||
<link rel="stylesheet" href="../../../css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/theme_extra.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../css/highlight.css">
|
||||
<link href="../../../extra.css" rel="stylesheet">
|
||||
|
||||
|
||||
<script>
|
||||
// Current page data
|
||||
var mkdocs_page_name = "Proximal Policy Optimization";
|
||||
</script>
|
||||
|
||||
<script src="../../../js/jquery-2.1.1.min.js"></script>
|
||||
<script src="../../../js/modernizr-2.8.3.min.js"></script>
|
||||
<script type="text/javascript" src="../../../js/highlight.pack.js"></script>
|
||||
<script src="../../../js/theme.js"></script>
|
||||
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav" role="document">
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
|
||||
<div class="wy-side-nav-search">
|
||||
<a href="../../.." class="icon icon-home"> Reinforcement Learning Coach Documentation</a>
|
||||
<div role="search">
|
||||
<form id ="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||||
<input type="text" name="q" placeholder="Search docs" />
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
||||
<ul class="current">
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../..">Home</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../design/index.html">Design</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../usage/index.html">Usage</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Algorithms</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dqn/index.html">DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/double_dqn/index.html">Double DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/dueling_dqn/index.html">Dueling DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/categorical_dqn/index.html">Categorical DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/mmc/index.html">Mixed Monte Carlo</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/pal/index.html">Persistent Advantage Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/nec/index.html">Neural Episodic Control</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/bs_dqn/index.html">Bootstrapped DQN</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/n_step/index.html">N-Step Q Learning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../value_optimization/naf/index.html">Normalized Advantage Functions</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../pg/index.html">Policy Gradient</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ac/index.html">Actor-Critic</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../ddpg/index.html">Deep Determinstic Policy Gradients</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 current">
|
||||
<a class="current" href="./index.html">Proximal Policy Optimization</a>
|
||||
|
||||
<ul>
|
||||
|
||||
<li class="toctree-l3"><a href="#proximal-policy-optimization">Proximal Policy Optimization</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#network-structure">Network Structure</a></li>
|
||||
|
||||
<li><a class="toctree-l4" href="#algorithm-description">Algorithm Description</a></li>
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../cppo/index.html">Clipped Proximal Policy Optimization</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../other/dfp/index.html">Direct Future Prediction</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../imitation/bc/index.html">Behavioral Cloning</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../dashboard/index.html">Coach Dashboard</a>
|
||||
|
||||
</li>
|
||||
<li>
|
||||
|
||||
<li>
|
||||
<ul class="subnav">
|
||||
<li><span>Contributing</span></li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_agent/index.html">Adding a New Agent</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
<li class="toctree-l1 ">
|
||||
<a class="" href="../../../contributing/add_env/index.html">Adding a New Environment</a>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
<li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
||||
|
||||
|
||||
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
||||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||||
<a href="../../..">Reinforcement Learning Coach Documentation</a>
|
||||
</nav>
|
||||
|
||||
|
||||
<div class="wy-nav-content">
|
||||
<div class="rst-content">
|
||||
<div role="navigation" aria-label="breadcrumbs navigation">
|
||||
<ul class="wy-breadcrumbs">
|
||||
<li><a href="../../..">Docs</a> »</li>
|
||||
|
||||
|
||||
|
||||
<li>Algorithms »</li>
|
||||
|
||||
|
||||
|
||||
<li>Proximal Policy Optimization</li>
|
||||
<li class="wy-breadcrumbs-aside">
|
||||
|
||||
</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
</div>
|
||||
<div role="main">
|
||||
<div class="section">
|
||||
|
||||
<h1 id="proximal-policy-optimization">Proximal Policy Optimization</h1>
|
||||
<p><strong>Action space:</strong> Discrete|Continuous</p>
|
||||
<p><strong>References:</strong> <a href="https://arxiv.org/pdf/1707.06347.pdf">Proximal Policy Optimization Algorithms</a></p>
|
||||
<h2 id="network-structure">Network Structure</h2>
|
||||
<p style="text-align: center;">
|
||||
|
||||
<img src="..\..\design_imgs\ppo.png">
|
||||
|
||||
</p>
|
||||
|
||||
<h2 id="algorithm-description">Algorithm Description</h2>
|
||||
<h3 id="choosing-an-action-continuous-actions">Choosing an action - Continuous actions</h3>
|
||||
<p>Run the observation through the policy network, and get the mean and standard deviation vectors for this observation. During the training phase, sample an action from a multi-dimensional Gaussian distribution with these mean and standard deviation values. When testing, take the mean values predicted by the network. </p>
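<p>A minimal sketch of this action-selection rule; the names are illustrative, not Coach's API:</p>
<pre><code class="python">import numpy as np

def choose_continuous_action(mean, std, is_training):
    """Sample from a diagonal Gaussian during training; return the mean when testing."""
    if is_training:
        return np.random.normal(mean, std)
    return mean
</code></pre>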
|
||||
<h3 id="training-the-network">Training the network</h3>
|
||||
<ol>
|
||||
<li>Collect a large batch of experience (on the order of thousands of transitions, sampled from multiple episodes).</li>
|
||||
<li>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman et al., 2015); a sketch of this computation follows the list. </li>
|
||||
<li>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers, the L-BFGS optimizer runs on the entire dataset at once, without batching. It continues running until some low loss threshold is reached. To prevent overfitting to the current dataset, the value targets are updated in a soft manner, using an Exponentially Weighted Moving Average, based on the total discounted returns of each state in each episode.</li>
|
||||
<li>Run several training iterations of the policy network. This is done by using the previously calculated advantages as targets. The loss function penalizes policies that deviate too far from the old policy (the policy that was used <em>before</em> starting to run the current set of training iterations) using a regularization term. </li>
|
||||
<li>After training is done, the last sampled KL divergence value is compared with the <em>target KL divergence</em> value in order to adapt the penalty coefficient used in the policy loss: if the KL divergence went too high, increase the penalty; if it went too low, reduce it; otherwise, leave it unchanged. </li>
|
||||
</ol>
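<p>A minimal NumPy sketch of the GAE advantage computation referenced in step 2, assuming a bootstrapped value estimate for the state following the last transition; the gamma and lambda values are illustrative, not necessarily those of the preset:</p>
<pre><code class="python">import numpy as np

def gae_advantages(rewards, values, bootstrap_value, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation (Schulman et al., 2015), a minimal sketch."""
    values = np.append(values, bootstrap_value)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]   # one-step TD error
        gae = delta + gamma * lam * gae                          # discounted sum of deltas
        advantages[t] = gae
    return advantages
</code></pre>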
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
<a href="../cppo/index.html" class="btn btn-neutral float-right" title="Clipped Proximal Policy Optimization"/>Next <span class="icon icon-circle-arrow-right"></span></a>
|
||||
|
||||
|
||||
<a href="../ddpg/index.html" class="btn btn-neutral" title="Deep Determinstic Policy Gradients"><span class="icon icon-circle-arrow-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<!-- Copyright etc -->
|
||||
|
||||
</div>
|
||||
|
||||
Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</section>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="rst-versions" role="note" style="cursor: pointer">
|
||||
<span class="rst-current-version" data-toggle="rst-current-version">
|
||||
|
||||
|
||||
<span><a href="../ddpg/index.html" style="color: #fcfcfc;">« Previous</a></span>
|
||||
|
||||
|
||||
<span style="margin-left: 15px"><a href="../cppo/index.html" style="color: #fcfcfc">Next »</a></span>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||