<h1 id="proximal-policy-optimization">Proximal Policy Optimization</h1>
<p><strong>Action space:</strong> Discrete | Continuous</p>
<p><strong>References:</strong> <a href="https://arxiv.org/pdf/1707.06347.pdf">Proximal Policy Optimization Algorithms</a></p>
<h2 id="network-structure">Network Structure</h2>
<p style="text-align: center;">
<img src="../../design_imgs/ppo.png" alt="PPO network structure">
</p>
<h2 id="algorithm-description">Algorithm Description</h2>
<h3 id="choosing-an-action-continuous-actions">Choosing an action - Continuous actions</h3>
<p>Run the observation through the policy network to get the mean and standard deviation vectors for this observation. During training, sample the action from a multi-dimensional Gaussian distribution with these mean and standard deviation values. During testing, skip the sampling and take the mean values predicted by the network.</p>
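<p>As a minimal sketch of this selection rule (not Coach's actual implementation), assume a hypothetical <code>policy_network</code> callable that returns the mean and standard deviation vectors for an observation:</p>
<pre><code class="python">import numpy as np

def choose_action(policy_network, observation, is_training):
    # Hypothetical policy head: returns a mean vector and a standard
    # deviation vector for the given observation.
    mean, std = policy_network(observation)
    if is_training:
        # Exploration: sample each action dimension from a Gaussian
        # parameterized by the predicted mean and standard deviation.
        return np.random.normal(mean, std)
    # Evaluation: act deterministically with the predicted mean.
    return mean
</code></pre>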
<h3 id="training-the-network">Training the network</h3>
<ol>
<li>Collect a large batch of experience (on the order of thousands of transitions, sampled from multiple episodes).</li>
<li>Calculate the advantage of each transition using the <em>Generalized Advantage Estimation</em> (GAE) method (Schulman et al., 2015); the first sketch after this list illustrates the calculation.</li>
<li>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first-order optimizers, L-BFGS runs on the entire dataset at once, without batching, and keeps iterating until a low loss threshold is reached. To prevent overfitting to the current dataset, the value targets are updated in a soft manner, using an exponentially weighted moving average based on the total discounted returns of each state in each episode.</li>
<li>Run several training iterations of the policy network, using the previously calculated advantages as targets. The loss function includes a regularization term that penalizes policies that deviate too far from the old policy (the policy that was used <em>before</em> starting the current set of training iterations).</li>
<li>After training is done, compare the last sampled KL divergence value with the <em>target KL divergence</em> value in order to adapt the penalty coefficient used in the policy loss: if the KL divergence is too high, increase the penalty; if it is too low, reduce it; otherwise, leave it unchanged. The second sketch after this list shows one version of this adaptation rule.</li>
</ol>
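<p>To make steps 2 and 3 concrete, the first sketch below computes GAE advantages for a single episode and mixes new discounted returns into the value targets with an exponentially weighted moving average. The function names and coefficients are illustrative assumptions, not Coach's API.</p>
<pre><code class="python">import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    # rewards: array of shape [T]; values: value estimates of shape [T + 1],
    # where the last entry is the value of the state reached after the final
    # transition (use 0 for a terminal state).
    deltas = rewards + gamma * values[1:] - values[:-1]   # one-step TD residuals
    advantages = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        # Discounted sum of future TD residuals, weighted by gamma * lambda.
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages

def soft_value_targets(old_targets, discounted_returns, alpha=0.1):
    # Exponentially weighted moving average: mix the new discounted returns
    # into the previous targets so the value network does not overfit the
    # latest batch of data.
    return (1.0 - alpha) * old_targets + alpha * discounted_returns
</code></pre>
<p>The second sketch summarizes steps 4 and 5: a KL-penalized surrogate loss and the penalty-coefficient adaptation rule described in the PPO paper. The thresholds and scaling factors follow the paper's adaptive-KL variant, but the names are illustrative rather than Coach's exact implementation.</p>
<pre><code class="python">import numpy as np

def kl_penalized_loss(new_log_probs, old_log_probs, advantages, kl_divergence, penalty_coefficient):
    # Likelihood ratio between the current policy and the old policy that
    # collected the data.
    ratio = np.exp(new_log_probs - old_log_probs)
    # Surrogate objective with a KL regularization term; the loss is its negation.
    return -(np.mean(ratio * advantages) - penalty_coefficient * kl_divergence)

def adapt_kl_penalty(penalty_coefficient, sampled_kl, target_kl):
    if sampled_kl &gt; 1.5 * target_kl:
        # The policy moved too far from the old policy: increase the penalty.
        return penalty_coefficient * 2.0
    if sampled_kl &lt; target_kl / 1.5:
        # The policy barely moved: relax the penalty.
        return penalty_coefficient / 2.0
    # Otherwise, leave the coefficient unchanged.
    return penalty_coefficient
</code></pre>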