<h1 id="direct-future-prediction">Direct Future Prediction</h1>
|
|
<p><strong>Actions space:</strong> Discrete</p>
|
|
<p><strong>References:</strong> <a href="https://arxiv.org/abs/1611.01779">Learning to Act by Predicting the Future</a></p>
|
|
<h2 id="network-structure">Network Structure</h2>
|
|
<p style="text-align: center;">
|
|
|
|
<img src="../../design_imgs/dfp.png" width=600>
|
|
|
|
</p>
|
|
|
|
<h2 id="algorithm-description">Algorithm Description</h2>
|
|
<h3 id="choosing-an-action">Choosing an action</h3>
|
|
1. The current state (observation and measurements) and the corresponding goal vector are passed as input to the network. The output of the network is the predicted future measurements for time-steps $t+1, t+2, t+4, t+8, t+16$ and $t+32$, for each possible action.
2. For each action, the predicted measurements of each time-step are multiplied by the goal vector, reducing them to a single vector of future values per action.
3. A weighted sum over these future values is then calculated, yielding a single value per action.
4. The action values are passed to the exploration policy, which decides on the action to use (see the sketch after this list).
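To make the selection flow concrete, here is a minimal NumPy sketch of steps 2-4. The network forward pass is omitted, and `predictions`, `goal`, `time_weights`, and the epsilon-greedy rule are illustrative assumptions rather than Coach's actual API:

```python
import numpy as np

def choose_action(predictions, goal, time_weights, epsilon=0.05):
    """Pick an action from predicted future measurements.

    Assumed shapes (hypothetical, for illustration):
      predictions:  [num_actions, num_timesteps, num_measurements]
                    network output for t+1, t+2, t+4, t+8, t+16, t+32
      goal:         [num_measurements]  -- the goal vector
      time_weights: [num_timesteps]     -- weights for the weighted sum
    """
    # Step 2: project each predicted time-step onto the goal vector,
    # giving one future value per (action, time-step) pair.
    future_values = predictions @ goal            # [num_actions, num_timesteps]

    # Step 3: weighted sum over time-steps -> a single value per action.
    action_values = future_values @ time_weights  # [num_actions]

    # Step 4: a simple epsilon-greedy rule as a stand-in for whatever
    # exploration policy is configured.
    if np.random.rand() < epsilon:
        return np.random.randint(len(action_values))
    return int(np.argmax(action_values))
```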
<h3 id="training-the-network">Training the network</h3>
|
|
<p>Given a batch of transitions, run them through the network to get the current predictions of the future measurements per action, and set them as the initial targets for training the network. For each transition <script type="math/tex">(s_t,a_t,r_t,s_{t+1} )</script> in the batch, the target of the network for the action that was taken, is the actual measurements that were seen in time-steps <script type="math/tex">t+1,t+2,t+4,t+8,t+16</script> and <script type="math/tex">t+32</script>. For the actions that were not taken, the targets are the current values.</p>
|
|
|
|
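A minimal sketch of this target construction, assuming hypothetical array shapes and names (this is not Coach's actual training code):

```python
import numpy as np

def build_targets(current_predictions, actions, observed_measurements):
    """Build training targets for one batch of transitions.

    Assumed shapes (hypothetical, for illustration):
      current_predictions:   [batch, num_actions, num_timesteps, num_measurements]
      actions:               [batch]  -- the action taken in each transition
      observed_measurements: [batch, num_timesteps, num_measurements]
                             measurements actually seen at t+1 ... t+32
    """
    # Start from the current predictions: actions that were not taken
    # keep their present values and thus contribute no error to the loss.
    targets = current_predictions.copy()

    # For the taken action, the target is the ground-truth future
    # measurements collected from the environment.
    batch_idx = np.arange(len(actions))
    targets[batch_idx, actions] = observed_measurements
    return targets
```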