<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Selecting an Algorithm — Reinforcement Learning Coach 0.11.0 documentation</title>
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/css/custom.css" type="text/css" />
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Coach Dashboard" href="dashboard.html" />
<link rel="prev" title="Benchmarks" href="features/benchmarks.html" />
<link href="_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<a href="index.html" class="icon icon-home"> Reinforcement Learning Coach
<img src="_static/dark_logo.png" class="logo" alt="Logo"/>
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Intro</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="dist_usage.html">Usage - Distributed Coach</a></li>
<li class="toctree-l1"><a class="reference internal" href="features/index.html">Features</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Selecting an Algorithm</a></li>
<li class="toctree-l1"><a class="reference internal" href="dashboard.html">Coach Dashboard</a></li>
</ul>
<p class="caption"><span class="caption-text">Design</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="design/control_flow.html">Control Flow</a></li>
<li class="toctree-l1"><a class="reference internal" href="design/network.html">Network Design</a></li>
<li class="toctree-l1"><a class="reference internal" href="design/horizontal_scaling.html">Distributed Coach - Horizontal Scale-Out</a></li>
</ul>
<p class="caption"><span class="caption-text">Contributing</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="contributing/add_agent.html">Adding a New Agent</a></li>
<li class="toctree-l1"><a class="reference internal" href="contributing/add_env.html">Adding a New Environment</a></li>
</ul>
<p class="caption"><span class="caption-text">Components</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="components/agents/index.html">Agents</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/architectures/index.html">Architectures</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/data_stores/index.html">Data Stores</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/environments/index.html">Environments</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/exploration_policies/index.html">Exploration Policies</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/filters/index.html">Filters</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/memories/index.html">Memories</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/memory_backends/index.html">Memory Backends</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/orchestrators/index.html">Orchestrators</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/core_types.html">Core Types</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/spaces.html">Spaces</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/additional_parameters.html">Additional Parameters</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">Reinforcement Learning Coach</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html">Docs</a> »</li>
<li>Selecting an Algorithm</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/selecting_an_algorithm.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="selecting-an-algorithm">
<h1>Selecting an Algorithm<a class="headerlink" href="#selecting-an-algorithm" title="Permalink to this headline">¶</a></h1>
<p>As you have probably noticed, Coach has a lot of algorithms implemented in it:</p>
<a class="reference internal image-reference" href="_images/algorithms.png"><img alt="_images/algorithms.png" class="align-center" src="_images/algorithms.png" style="width: 800px;" /></a>
<p><strong>“OK, that’s perfect, but I am trying to build a solution for my application, how do I select the right algorithm?”</strong></p>
<p>We have collected some guidelines for choosing the right algorithm for your application.
Answer the following questions to see which algorithms are the best fit for your task.
The algorithms are ordered by their release date, in descending order.</p>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script>
  $(document).ready(function() {
    // descending order of the agent badges according to their publish year
    function order_badges() {
      $(".badges-wrapper").find('.algorithm').sort(function(a, b) {
        // dataset.year is the concatenated year and month of the paper publishing date
        return b.dataset.year - a.dataset.year;
      }).appendTo($(".badges-wrapper"));
    }

    function update_algorithms_list() {
      // show all the badges
      $("input:checkbox, input:radio").each(function(){
        $('.' + this.id).show();
      });

      // remove all that don't fit the task
      $("input:checkbox").each(function(){
        if (!this.checked) {
          $('.' + this.id).hide();
        }
      });
      $("input:radio").each(function(){
        if (this.checked) {
          $('.algorithm').not('.' + this.id).hide();
        }
      });

      order_badges();
    }

    // toggle badges according to the checkbox change
    $('input:checkbox, input:radio').click(update_algorithms_list);

    update_algorithms_list();
  });
</script>
<div class="bordered-container">
|
||
<div class="questionnaire">
|
||
What are the type of actions your task requires?
|
||
<div style="margin-left: 12px;">
|
||
<input type="radio" id="discrete" name="actions" checked>Discrete actions<br>
|
||
<input type="radio" id="continuous" name="actions">Continuous actions<br>
|
||
</div>
|
||
<input type="checkbox" id="imitation" checked="True">Do you have expert demonstrations for your task?<br>
|
||
<input type="checkbox" id="on-policy" checked="True">Can you collect new data for your task dynamically?<br>
|
||
<input type="checkbox" id="requires-multi-worker" checked="True">Do you have a simulator for your task?<br>
|
||
</div>
|
||
|
||
<br>
|
||
<div class="badges-wrapper">
|
||
<div class="algorithm discrete off-policy" data-year="201300">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/dqn.html">DQN</a>
|
||
<br>
|
||
Learns action values for discrete actions, and allows learning from a replay buffer with old experiences
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201710">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/rainbow.html">Rainbow</a>
|
||
<br>
|
||
Combines multiple recent innovations on top of DQN for discrete controls, and achieves
|
||
much better results on known benchmarks
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous off-policy" data-year="201712">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/hac.html">HAC</a>
|
||
<br>
|
||
Works only for continuous actions, and uses hierarchy of agents to make the learning
|
||
more simple
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy data-year="201509">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/ddqn.html">DDQN</a>
|
||
<br>
|
||
An improvement over DQN, which learns more accurate action values, and therefore achieves better results
|
||
on known benchmarks
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete on-policy" data-year="201611">
|
||
<span class="badge">
|
||
<a href="components/agents/other/dfp.html">DFP</a>
|
||
<br>
|
||
Works only for discrete actions, by learning to predict the future values of a set of
|
||
measurements from the environment, and then using a goal vector to weight the importance of each of the
|
||
measurements
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201606">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/mmc.html">MMC</a>
|
||
<br>
|
||
A simple modification to DQN, which instead of learning action values only by bootstrapping the current
|
||
action value prediction, it mixes in the total discounted return as well. This helps learn the correct
|
||
action values faster, and is particularly useful for environments with delayed rewards.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201512">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/pal.html">PAL</a>
|
||
<br>
|
||
An improvement over DQN, that tries to deal with the approximation errors present in reinforcement
|
||
learning by increasing the gap between the value of the best action and the second best action.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous off-policy" data-year="201603">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/naf.html">NAF</a>
|
||
<br>
|
||
A variant of Q learning for continuous control.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201703">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/ddqn.html">NEC</a>
|
||
<br>
|
||
Uses a memory to "memorize" its experience and learn much faster by querying the memory on newly
|
||
seen states.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201710">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/qr_dqn.html">QR DQN</a>
|
||
<br>
|
||
Uses quantile regression to learn a distribution over the action values instead of only their mean.
|
||
This boosts performance on known benchmarks.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201602">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/bs_dqn.html">Bootstrapped DQN</a>
|
||
<br>
|
||
Uses an ensemble of DQN networks, where each network learns from a different subset of the experience
|
||
in order to improve exploration.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete on-policy requires-multi-worker" data-year="201602">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/n_step.html">N-Step Q Learning</a>
|
||
<br>
|
||
A variant of Q learning that uses bootstrapping of N steps ahead, instead of 1 step. Doing this
|
||
makes the algorithm on-policy and therefore requires having multiple workers training in parallel in
|
||
order for it to work well.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201706">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/categorical_dqn.html">Categorical DQN</a>
|
||
<br>
|
||
Learns a distribution over the action values instead of only their mean. This boosts performance on
|
||
known algorithms but requires knowing the range of possible values for the accumulated rewards before hand.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous discrete on-policy" data-year="199200">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/pg.html">Policy Gradient</a>
|
||
<br>
|
||
Based on the REINFORCE algorithm, this algorithm learn a probability distribution over the actions.
|
||
This is the most simple algorithm available in Coach, but also has the worse results.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete continuous on-policy requires-multi-worker" data-year="201602">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/ac.html">Actor Critic (A3C / A2C)</a>
|
||
<br>
|
||
Combines REINFORCE with a learned baseline (Critic) to improve stability of learning. It also
|
||
introduced the parallel learning of multiple workers to speed up data collection and improve the
|
||
learning stability and speed, both for discrete and continuous action spaces.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete on-policy requires-multi-worker" data-year="201707">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/acer.html">ACER</a>
|
||
<br>
|
||
Similar to A3C with the addition of experience replay and off-policy training. to reduce variance and
|
||
improve stability it also employs bias correction and trust region optimization techniques.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous off-policy" data-year="201808">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/sac.html">SAC</a>
|
||
<br>
|
||
Soft Actor-Critic is an algorithm which optimizes a stochastic policy in an off-policy way.
|
||
One of the key features of SAC is that it solves a maximum entropy reinforcement learning problem.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous off-policy" data-year="201509">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/ddpg.html">DDPG</a>
|
||
<br>
|
||
An actor critic scheme for continuous action spaces which assumes that the policy is deterministic,
|
||
and therefore it is able to use a replay buffer in order to improve sample efficiency.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous discrete on-policy" data-year="201706">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/ppo.html">PPO</a>
|
||
<br>
|
||
An actor critic scheme which uses bounded updates to the policy in order to make the learning process
|
||
very stable.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete continuous on-policy" data-year="201706">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/cppo.html">Clipped PPO</a>
|
||
<br>
|
||
A simplification of PPO, that reduces the code complexity while achieving similar results.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete continuous imitation off-policy" data-year="199700">
|
||
<span class="badge">
|
||
<a href="components/agents/imitation/bc.html">BC</a>
|
||
<br>
|
||
The simplest form of imitation learning. Uses supervised learning on a dataset of expert demonstrations
|
||
in order to imitate the expert behavior.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete continuous imitation off-policy" data-year="201710">
|
||
<span class="badge">
|
||
<a href="components/agents/imitation/cil.html">CIL</a>
|
||
<br>
|
||
A variant of behavioral cloning, where the learned policy is disassembled to several skills
|
||
(such as turning left or right in an intersection), and each skill is learned separately from the
|
||
human demonstrations.
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div><div class="section" id="does-your-environment-have-a-discrete-or-continuous-action-space">
|
||
<h2>1. Does your environment have a discrete or continuous action space?<a class="headerlink" href="#does-your-environment-have-a-discrete-or-continuous-action-space" title="Permalink to this headline">¶</a></h2>
|
||
<p>Some reinforcement learning algorithms work only for discrete action spaces, where the agent needs to select
one out of several possible actions. Other algorithms work only for continuous action spaces, where there are
infinitely many possible actions but some spatial relationship exists between them. Some algorithms
can be applied in both cases. The applicable algorithms therefore depend heavily on the task at hand.</p>
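<p>For example, assuming a gym-compatible environment (the environment id and the printed hints below are
only illustrative, not part of Coach), a minimal sketch for checking which case your task falls into is to
inspect its action space:</p>
<pre>
import gym
from gym import spaces

env = gym.make("CartPole-v0")  # replace with your own environment id

if isinstance(env.action_space, spaces.Discrete):
    # a finite set of actions - discrete-action algorithms such as the DQN family apply
    print("discrete actions:", env.action_space.n)
elif isinstance(env.action_space, spaces.Box):
    # a continuous range of actions - algorithms such as DDPG, SAC or NAF apply
    print("continuous actions, shape:", env.action_space.shape)
</pre>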
</div>
<div class="section" id="is-collecting-more-samples-from-your-environment-painful">
<h2>2. Is collecting more samples from your environment painful?<a class="headerlink" href="#is-collecting-more-samples-from-your-environment-painful" title="Permalink to this headline">¶</a></h2>
<p>Reinforcement learning algorithms are notorious for the amount of samples they need for training.
Typically, on-policy algorithms are much less sample efficient than off-policy algorithms. But there are
other algorithmic features that improve sample efficiency even further, like using a DND in NEC, or using
Hindsight Experience Replay. It is hard to say which algorithm is the most sample efficient, but we can at least say
which ones are not sample efficient.</p>
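<p>The distinction matters because off-policy algorithms can keep old transitions in a replay memory and reuse
them for many updates. A minimal sketch of that idea (not Coach's actual memory implementation) looks roughly
like this:</p>
<pre>
import random
from collections import deque

class ReplayBuffer:
    """Keeps past transitions so an off-policy agent can reuse them many times."""
    def __init__(self, max_size=100000):
        self.transitions = deque(maxlen=max_size)

    def store(self, state, action, reward, next_state, done):
        self.transitions.append((state, action, reward, next_state, done))

    def sample(self, batch_size=32):
        # every stored transition can be sampled over and over, which is what
        # makes off-policy algorithms comparatively sample efficient
        return random.sample(self.transitions, batch_size)
</pre>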
</div>
<div class="section" id="do-you-have-a-simulator-that-can-be-parallelized-across-multiple-processes-or-nodes">
<h2>3. Do you have a simulator that can be parallelized across multiple processes or nodes?<a class="headerlink" href="#do-you-have-a-simulator-that-can-be-parallelized-across-multiple-processes-or-nodes" title="Permalink to this headline">¶</a></h2>
<p>Parallelizing training across multiple workers, located on the same node or on different nodes, is a technique
that was introduced in recent years and has had a lot of success in improving the results of multiple algorithms.
As part of this, some algorithms don't work well unless they are parallelized across multiple workers,
which requires having a simulator for each worker.</p>
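<p>Conceptually, each worker owns its own simulator instance and feeds experience back to the learner. The
following is only an illustrative sketch using Python's multiprocessing and a random policy, with the older
4-tuple gym step API and CartPole-v0 standing in for your simulator; it is not Coach's actual multi-worker
mechanism:</p>
<pre>
import multiprocessing as mp

def rollout_worker(worker_id, episodes_queue):
    import gym
    env = gym.make("CartPole-v0")  # each worker owns its own simulator instance
    state = env.reset()
    episode_reward = 0.0
    while True:
        action = env.action_space.sample()       # a real worker would query the current policy
        state, reward, done, _ = env.step(action)
        episode_reward += reward
        if done:
            episodes_queue.put((worker_id, episode_reward))
            state = env.reset()
            episode_reward = 0.0

if __name__ == "__main__":
    queue = mp.Queue()
    workers = [mp.Process(target=rollout_worker, args=(i, queue), daemon=True) for i in range(4)]
    for w in workers:
        w.start()
    for _ in range(10):                           # consume a few episodes produced in parallel
        print(queue.get())
</pre>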
</div>
<div class="section" id="do-you-have-human-demonstrations-for-solving-the-task">
<h2>4. Do you have human demonstrations for solving the task?<a class="headerlink" href="#do-you-have-human-demonstrations-for-solving-the-task" title="Permalink to this headline">¶</a></h2>
<p>If human demonstrations are available for a task, most of the time it is better to use them than to train
with regular reinforcement learning from scratch. To make use of human demonstrations, we have implemented several
tools and algorithms for imitation learning in Coach.</p>
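<p>At its core, behavioral cloning (the simplest of these algorithms) is just supervised learning on recorded
(state, action) pairs. A minimal sketch of that idea, with hypothetical demonstration files and scikit-learn
standing in for Coach's own imitation agents:</p>
<pre>
import numpy as np
from sklearn.neural_network import MLPClassifier

# hypothetical expert dataset: observations and the discrete actions the expert took
expert_states = np.load("expert_states.npy")    # shape: (num_samples, state_dim)
expert_actions = np.load("expert_actions.npy")  # shape: (num_samples,)

# behavioral cloning boils down to supervised learning of state -> action
policy = MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=200)
policy.fit(expert_states, expert_actions)

def act(state):
    # the cloned policy simply imitates whatever the expert would have done
    return policy.predict(state.reshape(1, -1))[0]
</pre>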
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="dashboard.html" class="btn btn-neutral float-right" title="Coach Dashboard" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="features/benchmarks.html" class="btn btn-neutral" title="Benchmarks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
© Copyright 2018, Intel AI Lab
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>