<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Selecting an Algorithm — Reinforcement Learning Coach 0.11.0 documentation</title>
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/css/custom.css" type="text/css" />
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Coach Dashboard" href="dashboard.html" />
<link rel="prev" title="Benchmarks" href="features/benchmarks.html" />
<link href="_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<a href="index.html" class="icon icon-home"> Reinforcement Learning Coach
<img src="_static/dark_logo.png" class="logo" alt="Logo"/>
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Intro</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="dist_usage.html">Usage - Distributed Coach</a></li>
<li class="toctree-l1"><a class="reference internal" href="features/index.html">Features</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Selecting an Algorithm</a></li>
<li class="toctree-l1"><a class="reference internal" href="dashboard.html">Coach Dashboard</a></li>
</ul>
<p class="caption"><span class="caption-text">Design</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="design/control_flow.html">Control Flow</a></li>
<li class="toctree-l1"><a class="reference internal" href="design/network.html">Network Design</a></li>
<li class="toctree-l1"><a class="reference internal" href="design/horizontal_scaling.html">Distributed Coach - Horizontal Scale-Out</a></li>
</ul>
<p class="caption"><span class="caption-text">Contributing</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="contributing/add_agent.html">Adding a New Agent</a></li>
<li class="toctree-l1"><a class="reference internal" href="contributing/add_env.html">Adding a New Environment</a></li>
</ul>
<p class="caption"><span class="caption-text">Components</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="components/agents/index.html">Agents</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/architectures/index.html">Architectures</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/data_stores/index.html">Data Stores</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/environments/index.html">Environments</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/exploration_policies/index.html">Exploration Policies</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/filters/index.html">Filters</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/memories/index.html">Memories</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/memory_backends/index.html">Memory Backends</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/orchestrators/index.html">Orchestrators</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/core_types.html">Core Types</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/spaces.html">Spaces</a></li>
<li class="toctree-l1"><a class="reference internal" href="components/additional_parameters.html">Additional Parameters</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">Reinforcement Learning Coach</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html">Docs</a> »</li>
<li>Selecting an Algorithm</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/selecting_an_algorithm.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="selecting-an-algorithm">
<h1>Selecting an Algorithm<a class="headerlink" href="#selecting-an-algorithm" title="Permalink to this headline">¶</a></h1>
<p>As you have probably noticed, Coach has a lot of algorithms implemented in it:</p>
<a class="reference internal image-reference" href="_images/algorithms.png"><img alt="_images/algorithms.png" class="align-center" src="_images/algorithms.png" style="width: 800px;" /></a>
<p><strong>“OK, that’s perfect, but I am trying to build a solution for my application, how do I select the right algorithm?”</strong></p>
<p>We have collected some guidelines for choosing the right algorithm for your application.
Answer the following questions to see which algorithms are the best fit for your task.
The algorithms are ordered by their release date, in descending order.</p>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script>
  $(document).ready(function() {
    // descending order of the agent badges according to their publish year
    function order_badges() {
      $(".badges-wrapper").find('.algorithm').sort(function(a, b) {
        // dataset.year is the concatenated year and month of the paper publishing date
        return b.dataset.year - a.dataset.year;
      }).appendTo($(".badges-wrapper"));
    }

    function update_algorithms_list() {
      // show all the badges
      $("input:checkbox, input:radio").each(function(){
        $('.' + this.id).show();
      });

      // remove all that don't fit the task
      $("input:checkbox").each(function(){
        if (!this.checked) {
          $('.' + this.id).hide();
        }
      });
      $("input:radio").each(function(){
        if (this.checked) {
          $('.algorithm').not('.' + this.id).hide();
        }
      });

      order_badges();
    }

    // toggle badges according to the checkbox change
    $('input:checkbox, input:radio').click(update_algorithms_list);

    update_algorithms_list();
  });
</script>
<div class="bordered-container">
|
||
<div class="questionnaire">
|
||
What are the type of actions your task requires?
|
||
<div style="margin-left: 12px;">
|
||
<input type="radio" id="discrete" name="actions" checked>Discrete actions<br>
|
||
<input type="radio" id="continuous" name="actions">Continuous actions<br>
|
||
</div>
|
||
<input type="checkbox" id="imitation" checked="True">Do you have expert demonstrations for your task?<br>
|
||
<input type="checkbox" id="on-policy" checked="True">Can you collect new data for your task dynamically?<br>
|
||
<input type="checkbox" id="requires-multi-worker" checked="True">Do you have a simulator for your task?<br>
|
||
</div>
|
||
|
||
<br>
|
||
<div class="badges-wrapper">
|
||
<div class="algorithm discrete off-policy" data-year="201300">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/dqn.html">DQN</a>
|
||
<br>
|
||
Learns action values for discrete actions, and allows learning from a replay buffer with old experiences
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201710">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/rainbow.html">Rainbow</a>
|
||
<br>
|
||
Combines multiple recent innovations on top of DQN for discrete controls, and achieves
|
||
much better results on known benchmarks
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous off-policy" data-year="201712">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/hac.html">HAC</a>
|
||
<br>
|
||
Works only for continuous actions, and uses hierarchy of agents to make the learning
|
||
more simple
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy data-year="201509">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/ddqn.html">DDQN</a>
|
||
<br>
|
||
An improvement over DQN, which learns more accurate action values, and therefore achieves better results
|
||
on known benchmarks
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete on-policy" data-year="201611">
|
||
<span class="badge">
|
||
<a href="components/agents/other/dfp.html">DFP</a>
|
||
<br>
|
||
Works only for discrete actions, by learning to predict the future values of a set of
|
||
measurements from the environment, and then using a goal vector to weight the importance of each of the
|
||
measurements
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201606">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/mmc.html">MMC</a>
|
||
<br>
|
||
A simple modification to DQN, which instead of learning action values only by bootstrapping the current
|
||
action value prediction, it mixes in the total discounted return as well. This helps learn the correct
|
||
action values faster, and is particularly useful for environments with delayed rewards.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201512">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/pal.html">PAL</a>
|
||
<br>
|
||
An improvement over DQN, that tries to deal with the approximation errors present in reinforcement
|
||
learning by increasing the gap between the value of the best action and the second best action.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous off-policy" data-year="201603">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/naf.html">NAF</a>
|
||
<br>
|
||
A variant of Q learning for continuous control.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201703">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/ddqn.html">NEC</a>
|
||
<br>
|
||
Uses a memory to "memorize" its experience and learn much faster by querying the memory on newly
|
||
seen states.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201710">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/qr_dqn.html">QR DQN</a>
|
||
<br>
|
||
Uses quantile regression to learn a distribution over the action values instead of only their mean.
|
||
This boosts performance on known benchmarks.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201602">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/bs_dqn.html">Bootstrapped DQN</a>
|
||
<br>
|
||
Uses an ensemble of DQN networks, where each network learns from a different subset of the experience
|
||
in order to improve exploration.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete on-policy requires-multi-worker" data-year="201602">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/n_step.html">N-Step Q Learning</a>
|
||
<br>
|
||
A variant of Q learning that uses bootstrapping of N steps ahead, instead of 1 step. Doing this
|
||
makes the algorithm on-policy and therefore requires having multiple workers training in parallel in
|
||
order for it to work well.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete off-policy" data-year="201706">
|
||
<span class="badge">
|
||
<a href="components/agents/value_optimization/categorical_dqn.html">Categorical DQN</a>
|
||
<br>
|
||
Learns a distribution over the action values instead of only their mean. This boosts performance on
|
||
known algorithms but requires knowing the range of possible values for the accumulated rewards before hand.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous discrete on-policy" data-year="199200">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/pg.html">Policy Gradient</a>
|
||
<br>
|
||
Based on the REINFORCE algorithm, this algorithm learn a probability distribution over the actions.
|
||
This is the most simple algorithm available in Coach, but also has the worse results.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete continuous on-policy requires-multi-worker" data-year="201602">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/ac.html">Actor Critic (A3C / A2C)</a>
|
||
<br>
|
||
Combines REINFORCE with a learned baseline (Critic) to improve stability of learning. It also
|
||
introduced the parallel learning of multiple workers to speed up data collection and improve the
|
||
learning stability and speed, both for discrete and continuous action spaces.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete on-policy requires-multi-worker" data-year="201707">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/acer.html">ACER</a>
|
||
<br>
|
||
Similar to A3C with the addition of experience replay and off-policy training. to reduce variance and
|
||
improve stability it also employs bias correction and trust region optimization techniques.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous off-policy" data-year="201808">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/sac.html">SAC</a>
|
||
<br>
|
||
Soft Actor-Critic is an algorithm which optimizes a stochastic policy in an off-policy way.
|
||
One of the key features of SAC is that it solves a maximum entropy reinforcement learning problem.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous off-policy" data-year="201509">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/ddpg.html">DDPG</a>
|
||
<br>
|
||
An actor critic scheme for continuous action spaces which assumes that the policy is deterministic,
|
||
and therefore it is able to use a replay buffer in order to improve sample efficiency.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm continuous discrete on-policy" data-year="201706">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/ppo.html">PPO</a>
|
||
<br>
|
||
An actor critic scheme which uses bounded updates to the policy in order to make the learning process
|
||
very stable.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete continuous on-policy" data-year="201706">
|
||
<span class="badge">
|
||
<a href="components/agents/policy_optimization/cppo.html">Clipped PPO</a>
|
||
<br>
|
||
A simplification of PPO, that reduces the code complexity while achieving similar results.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete continuous imitation off-policy" data-year="199700">
|
||
<span class="badge">
|
||
<a href="components/agents/imitation/bc.html">BC</a>
|
||
<br>
|
||
The simplest form of imitation learning. Uses supervised learning on a dataset of expert demonstrations
|
||
in order to imitate the expert behavior.
|
||
</span>
|
||
</div>
|
||
<div class="algorithm discrete continuous imitation off-policy" data-year="201710">
|
||
<span class="badge">
|
||
<a href="components/agents/imitation/cil.html">CIL</a>
|
||
<br>
|
||
A variant of behavioral cloning, where the learned policy is disassembled to several skills
|
||
(such as turning left or right in an intersection), and each skill is learned separately from the
|
||
human demonstrations.
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div><div class="section" id="does-your-environment-have-a-discrete-or-continuous-action-space">
|
||
<h2>1. Does your environment have a discrete or continuous action space?<a class="headerlink" href="#does-your-environment-have-a-discrete-or-continuous-action-space" title="Permalink to this headline">¶</a></h2>
|
||
<p>Some reinforcement learning algorithms work only for discrete action spaces, where the agent needs to select
one out of several possible actions. Other algorithms work only for continuous action spaces, where there are
infinitely many possible actions but some spatial relationship exists between them. Some algorithms
can be applied in both cases. The applicable algorithms therefore depend heavily on the task at hand.</p>
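<p>For example, assuming a gym-compatible environment (the environment id and the printed hints below are
only illustrative, not part of Coach), a minimal sketch for checking which case your task falls into is to
inspect its action space:</p>
<pre>
import gym
from gym import spaces

env = gym.make("CartPole-v0")  # replace with your own environment id

if isinstance(env.action_space, spaces.Discrete):
    # a finite set of actions - discrete-action algorithms such as the DQN family apply
    print("discrete actions:", env.action_space.n)
elif isinstance(env.action_space, spaces.Box):
    # a continuous range of actions - algorithms such as DDPG, SAC or NAF apply
    print("continuous actions, shape:", env.action_space.shape)
</pre>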
</div>
<div class="section" id="is-collecting-more-samples-from-your-environment-painful">
<h2>2. Is collecting more samples from your environment painful?<a class="headerlink" href="#is-collecting-more-samples-from-your-environment-painful" title="Permalink to this headline">¶</a></h2>
<p>Reinforcement learning algorithms are notorious for the amount of samples they need for training.
Typically, on-policy algorithms are much less sample efficient than off-policy algorithms. But there are
other algorithmic features that improve sample efficiency even further, like using a DND in NEC, or using
Hindsight Experience Replay. It is hard to say which algorithm is the most sample efficient, but we can at least say
which ones are not sample efficient.</p>
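<p>The distinction matters because off-policy algorithms can keep old transitions in a replay memory and reuse
them for many updates. A minimal sketch of that idea (not Coach's actual memory implementation) looks roughly
like this:</p>
<pre>
import random
from collections import deque

class ReplayBuffer:
    """Keeps past transitions so an off-policy agent can reuse them many times."""
    def __init__(self, max_size=100000):
        self.transitions = deque(maxlen=max_size)

    def store(self, state, action, reward, next_state, done):
        self.transitions.append((state, action, reward, next_state, done))

    def sample(self, batch_size=32):
        # every stored transition can be sampled over and over, which is what
        # makes off-policy algorithms comparatively sample efficient
        return random.sample(self.transitions, batch_size)
</pre>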
</div>
<div class="section" id="do-you-have-a-simulator-that-can-be-parallelized-across-multiple-processes-or-nodes">
<h2>3. Do you have a simulator that can be parallelized across multiple processes or nodes?<a class="headerlink" href="#do-you-have-a-simulator-that-can-be-parallelized-across-multiple-processes-or-nodes" title="Permalink to this headline">¶</a></h2>
<p>Parallelizing training across multiple workers, located on the same node or on different nodes, is a technique
that was introduced in recent years and has had a lot of success in improving the results of multiple algorithms.
As part of this, some algorithms don't work well unless they are parallelized across multiple workers,
which requires having a simulator for each worker.</p>
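<p>Conceptually, each worker owns its own simulator instance and feeds experience back to the learner. The
following is only an illustrative sketch using Python's multiprocessing and a random policy, with the older
4-tuple gym step API and CartPole-v0 standing in for your simulator; it is not Coach's actual multi-worker
mechanism:</p>
<pre>
import multiprocessing as mp

def rollout_worker(worker_id, episodes_queue):
    import gym
    env = gym.make("CartPole-v0")  # each worker owns its own simulator instance
    state = env.reset()
    episode_reward = 0.0
    while True:
        action = env.action_space.sample()       # a real worker would query the current policy
        state, reward, done, _ = env.step(action)
        episode_reward += reward
        if done:
            episodes_queue.put((worker_id, episode_reward))
            state = env.reset()
            episode_reward = 0.0

if __name__ == "__main__":
    queue = mp.Queue()
    workers = [mp.Process(target=rollout_worker, args=(i, queue), daemon=True) for i in range(4)]
    for w in workers:
        w.start()
    for _ in range(10):                           # consume a few episodes produced in parallel
        print(queue.get())
</pre>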
</div>
<div class="section" id="do-you-have-human-demonstrations-for-solving-the-task">
<h2>4. Do you have human demonstrations for solving the task?<a class="headerlink" href="#do-you-have-human-demonstrations-for-solving-the-task" title="Permalink to this headline">¶</a></h2>
<p>If human demonstrations are available for a task, most of the time it is better to use them than to train
with regular reinforcement learning from scratch. To make use of human demonstrations, we have implemented several
tools and algorithms for imitation learning in Coach.</p>
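<p>At its core, behavioral cloning (the simplest of these algorithms) is just supervised learning on recorded
(state, action) pairs. A minimal sketch of that idea, with hypothetical demonstration files and scikit-learn
standing in for Coach's own imitation agents:</p>
<pre>
import numpy as np
from sklearn.neural_network import MLPClassifier

# hypothetical expert dataset: observations and the discrete actions the expert took
expert_states = np.load("expert_states.npy")    # shape: (num_samples, state_dim)
expert_actions = np.load("expert_actions.npy")  # shape: (num_samples,)

# behavioral cloning boils down to supervised learning of state -> action
policy = MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=200)
policy.fit(expert_states, expert_actions)

def act(state):
    # the cloned policy simply imitates whatever the expert would have done
    return policy.predict(state.reshape(1, -1))[0]
</pre>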
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="dashboard.html" class="btn btn-neutral float-right" title="Coach Dashboard" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="features/benchmarks.html" class="btn btn-neutral" title="Benchmarks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
© Copyright 2018, Intel AI Lab
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>