mirror of
https://github.com/gryf/coach.git
synced 2025-12-18 03:30:19 +01:00
Enabling Coach Documentation to be run even when environments are not installed (#326)
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Behavioral Cloning — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Behavioral Cloning — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="ACER" href="../policy_optimization/acer.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -230,9 +233,9 @@ These demonstrations are given as state, action tuples, and with no reward.
|
||||
The training goal is to reduce the difference between the actions predicted by the network and the actions taken by
|
||||
the expert for each state.</p>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>Use the current states as input to the network, and the expert actions as the targets of the network.</li>
|
||||
<li>For the network head, we use the policy head, which uses the cross entropy loss function.</li>
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>Use the current states as input to the network, and the expert actions as the targets of the network.</p></li>
|
||||
<li><p>For the network head, we use the policy head, which uses the cross entropy loss function.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.bc_agent.BCAlgorithmParameters">
|
||||
@@ -254,7 +257,7 @@ the expert for each state.</p>
|
||||
<a href="../value_optimization/bs_dqn.html" class="btn btn-neutral float-right" title="Bootstrapped DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../policy_optimization/acer.html" class="btn btn-neutral" title="ACER" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../policy_optimization/acer.html" class="btn btn-neutral float-left" title="ACER" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -263,7 +266,7 @@ the expert for each state.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -280,27 +283,16 @@ the expert for each state.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Conditional Imitation Learning — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Conditional Imitation Learning — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Categorical DQN" href="../value_optimization/categorical_dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -233,25 +236,22 @@ the expert for each state.
|
||||
In conditional imitation learning, each transition is assigned a class, which determines the goal that was pursued
|
||||
in that transition. For example, 3 possible classes can be: turn right, turn left and follow lane.</p>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer, where the batch is balanced, meaning that an equal number
|
||||
of transitions will be sampled from each class index.</li>
|
||||
<li>Use the current states as input to the network, and assign the expert actions as the targets of the network heads
|
||||
<li><p>Sample a batch of transitions from the replay buffer, where the batch is balanced, meaning that an equal number
|
||||
of transitions will be sampled from each class index.</p></li>
|
||||
<li><p>Use the current states as input to the network, and assign the expert actions as the targets of the network heads
|
||||
corresponding to the state classes. For the other heads, set the targets to match the currently predicted values,
|
||||
so that the loss for the other heads will be zeroed out.</li>
|
||||
<li>We use a regression head, that minimizes the MSE loss between the network predicted values and the target values.</li>
|
||||
so that the loss for the other heads will be zeroed out.</p></li>
|
||||
<li><p>We use a regression head, that minimizes the MSE loss between the network predicted values and the target values.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.cil_agent.CILAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.cil_agent.</code><code class="descname">CILAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/cil_agent.html#CILAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.cil_agent.CILAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state_key_with_the_class_index</strong> – (str)
|
||||
The key of the state dictionary which corresponds to the value that will be used to control the class index.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>state_key_with_the_class_index</strong> – (str)
|
||||
The key of the state dictionary which corresponds to the value that will be used to control the class index.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -269,7 +269,7 @@ The key of the state dictionary which corresponds to the value that will be used
|
||||
<a href="../policy_optimization/cppo.html" class="btn btn-neutral float-right" title="Clipped Proximal Policy Optimization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../value_optimization/categorical_dqn.html" class="btn btn-neutral" title="Categorical DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../value_optimization/categorical_dqn.html" class="btn btn-neutral float-left" title="Categorical DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -278,7 +278,7 @@ The key of the state dictionary which corresponds to the value that will be used
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -295,27 +295,16 @@ The key of the state dictionary which corresponds to the value that will be used
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Agents — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Agents — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Adding a New Environment" href="../../contributing/add_env.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -241,59 +244,50 @@ A detailed description of those algorithms can be found by navigating to each of
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.base_parameters.AgentParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">AgentParameters</code><span class="sig-paren">(</span><em>algorithm: rl_coach.base_parameters.AlgorithmParameters, exploration: ExplorationParameters, memory: MemoryParameters, networks: Dict[str, rl_coach.base_parameters.NetworkParameters], visualization: rl_coach.base_parameters.VisualizationParameters = <rl_coach.base_parameters.VisualizationParameters object></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/base_parameters.html#AgentParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.AgentParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>algorithm</strong> – A class inheriting AlgorithmParameters.
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>algorithm</strong> – A class inheriting AlgorithmParameters.
|
||||
The parameters used for the specific algorithm used by the agent.
|
||||
These parameters can be later referenced in the agent implementation through self.ap.algorithm.</li>
|
||||
<li><strong>exploration</strong> – Either a class inheriting ExplorationParameters or a dictionary mapping between action
|
||||
These parameters can be later referenced in the agent implementation through self.ap.algorithm.</p></li>
|
||||
<li><p><strong>exploration</strong> – Either a class inheriting ExplorationParameters or a dictionary mapping between action
|
||||
space types and their corresponding ExplorationParameters. If a dictionary was used,
|
||||
when the agent will be instantiated, the correct exploration policy parameters will be used
|
||||
according to the real type of the environment action space.
|
||||
These parameters will be used to instantiate the exploration policy.</li>
|
||||
<li><strong>memory</strong> – A class inheriting MemoryParameters. It defines all the parameters used by the memory module.</li>
|
||||
<li><strong>networks</strong> – A dictionary mapping between network names and their corresponding network parameters, defined
|
||||
These parameters will be used to instantiate the exploration policy.</p></li>
|
||||
<li><p><strong>memory</strong> – A class inheriting MemoryParameters. It defines all the parameters used by the memory module.</p></li>
|
||||
<li><p><strong>networks</strong> – A dictionary mapping between network names and their corresponding network parameters, defined
|
||||
as a class inheriting NetworkParameters. Each element will be used in order to instantiate
|
||||
a NetworkWrapper class, and all the network wrappers will be stored in the agent under
|
||||
self.network_wrappers. self.network_wrappers is a dict mapping between the network name that
|
||||
was given in the networks dict, and the instantiated network wrapper.</li>
|
||||
<li><strong>visualization</strong> – A class inheriting VisualizationParameters and defining various parameters that can be
|
||||
used for visualization purposes, such as printing to the screen, rendering, and saving videos.</li>
|
||||
was given in the networks dict, and the instantiated network wrapper.</p></li>
|
||||
<li><p><strong>visualization</strong> – A class inheriting VisualizationParameters and defining various parameters that can be
|
||||
used for visualization purposes, such as printing to the screen, rendering, and saving videos.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.agent.Agent">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.agent.</code><code class="descname">Agent</code><span class="sig-paren">(</span><em>agent_parameters: rl_coach.base_parameters.AgentParameters</em>, <em>parent: Union[LevelManager</em>, <em>CompositeAgent] = None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>agent_parameters</strong> – An AgentParameters class instance with all the agent parameters</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>agent_parameters</strong> – An AgentParameters class instance with all the agent parameters</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.act">
|
||||
<code class="descname">act</code><span class="sig-paren">(</span><em>action: Union[None</em>, <em>int</em>, <em>float</em>, <em>numpy.ndarray</em>, <em>List] = None</em><span class="sig-paren">)</span> → rl_coach.core_types.ActionInfo<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.act"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.act" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given the agents current knowledge, decide on the next action to apply to the environment</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> – An action to take, overriding whatever the current policy is</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">An ActionInfo object, which contains the action and any additional info from the action decision process</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action</strong> – An action to take, overriding whatever the current policy is</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>An ActionInfo object, which contains the action and any additional info from the action decision process</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -302,21 +296,17 @@ used for visualization purposes, such as printing to the screen, rendering, and
|
||||
<dd><p>This function is a wrapper to allow having the same calls for shared or unshared memories.
|
||||
It should be used instead of calling the memory directly in order to allow different algorithms to work
|
||||
both with a shared and a local memory.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>func</strong> – the name of the memory function to call</li>
|
||||
<li><strong>args</strong> – the arguments to supply to the function</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>func</strong> – the name of the memory function to call</p></li>
|
||||
<li><p><strong>args</strong> – the arguments to supply to the function</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the return value of the function</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the return value of the function</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -324,16 +314,14 @@ both with a shared and a local memory.</p>
|
||||
<code class="descname">choose_action</code><span class="sig-paren">(</span><em>curr_state</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.choose_action"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.choose_action" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>choose an action to act with in the current episode being played. Different behavior might be exhibited when
|
||||
training or testing.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>curr_state</strong> – the current state to act upon.</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">chosen action, some action value describing the action (q-value, probability, etc)</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>curr_state</strong> – the current state to act upon.</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>chosen action, some action value describing the action (q-value, probability, etc)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -351,14 +339,11 @@ training or testing.</p>
|
||||
<dd><p>Create all the networks of the agent.
|
||||
The network creation will be done after setting the environment parameters for the agent, since they are needed
|
||||
for creating the network.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A list containing all the networks</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>A list containing all the networks</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -367,37 +352,31 @@ for creating the network.</p>
|
||||
<dd><p>Get a prediction from the agent with regard to the requested prediction_type.
|
||||
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
|
||||
raise a ValueException.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>states</strong> – The states to get a prediction for</li>
|
||||
<li><strong>prediction_type</strong> – The type of prediction to get for the states. For example, the state-value prediction.</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>states</strong> – The states to get a prediction for</p></li>
|
||||
<li><p><strong>prediction_type</strong> – The type of prediction to get for the states. For example, the state-value prediction.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the predicted values</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the predicted values</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.get_state_embedding">
|
||||
<code class="descname">get_state_embedding</code><span class="sig-paren">(</span><em>state: dict</em><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.get_state_embedding"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.get_state_embedding" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given a state, get the corresponding state embedding from the main network</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state</strong> – a state dict</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy embedding vector</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>state</strong> – a state dict</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a numpy embedding vector</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -406,14 +385,11 @@ raise a ValueException.</p>
|
||||
<dd><p>Make any changes needed when each episode is ended.
|
||||
This includes incrementing counters, updating full episode dependent values, updating logs, etc.
|
||||
This function is called right after each episode is ended.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -421,44 +397,36 @@ This function is called right after each episode is ended.</p>
|
||||
<code class="descname">init_environment_dependent_modules</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.init_environment_dependent_modules"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.init_environment_dependent_modules" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Initialize any modules that depend on knowing information about the environment such as the action space or
|
||||
the observation space</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.learn_from_batch">
|
||||
<code class="descname">learn_from_batch</code><span class="sig-paren">(</span><em>batch</em><span class="sig-paren">)</span> → Tuple[float, List, List]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.learn_from_batch"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.learn_from_batch" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given a batch of transitions, calculates their target values and updates the network.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>batch</strong> – A list of transitions</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">The total loss of the training, the loss per head and the unclipped gradients</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>batch</strong> – A list of transitions</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>The total loss of the training, the loss per head and the unclipped gradients</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.log_to_screen">
|
||||
<code class="descname">log_to_screen</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.log_to_screen"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.log_to_screen" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Write an episode summary line to the terminal</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -467,59 +435,48 @@ the observation space</p>
|
||||
<dd><p>Given a response from the environment, distill the observation from it and store it for later use.
|
||||
The response should be a dictionary containing the performed action, the new observation and measurements,
|
||||
the reward, a game over flag and any additional information necessary.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>env_response</strong> – result of call from environment.step(action)</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a boolean value which determines if the agent has decided to terminate the episode after seeing the
|
||||
given observation</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>env_response</strong> – result of call from environment.step(action)</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a boolean value which determines if the agent has decided to terminate the episode after seeing the
|
||||
given observation</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="attribute">
|
||||
<dt id="rl_coach.agents.agent.Agent.parent">
|
||||
<code class="descname">parent</code><a class="headerlink" href="#rl_coach.agents.agent.Agent.parent" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the parent class of the agent</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the parent of the agent</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>the parent of the agent</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="attribute">
|
||||
<dt id="rl_coach.agents.agent.Agent.phase">
|
||||
<code class="descname">phase</code><a class="headerlink" href="#rl_coach.agents.agent.Agent.phase" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>The current running phase of the agent</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">RunPhase</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>RunPhase</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.post_training_commands">
|
||||
<code class="descname">post_training_commands</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.post_training_commands"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.post_training_commands" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>A function which allows adding any functionality that is required to run right after the training phase ends.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -527,45 +484,37 @@ given observation</td>
|
||||
<code class="descname">prepare_batch_for_inference</code><span class="sig-paren">(</span><em>states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str</em><span class="sig-paren">)</span> → Dict[str, numpy.array]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.prepare_batch_for_inference"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.prepare_batch_for_inference" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Convert curr_state into the input tensors TensorFlow is expecting, i.e. if we have several input states, stack all
|
||||
observations together, measurements together, etc.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>states</strong> – A list of environment states, where each one is a dict mapping from an observation name to its
|
||||
corresponding observation</li>
|
||||
<li><strong>network_name</strong> – The agent network name to prepare the batch for. This is needed in order to extract only
|
||||
the observation relevant for the network from the states.</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>states</strong> – A list of environment states, where each one is a dict mapping from an observation name to its
|
||||
corresponding observation</p></li>
|
||||
<li><p><strong>network_name</strong> – The agent network name to prepare the batch for. This is needed in order to extract only
|
||||
the observation relevant for the network from the states.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A dictionary containing a list of values from all the given states for each of the observations</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>A dictionary containing a list of values from all the given states for each of the observations</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.register_signal">
|
||||
<code class="descname">register_signal</code><span class="sig-paren">(</span><em>signal_name: str</em>, <em>dump_one_value_per_episode: bool = True</em>, <em>dump_one_value_per_step: bool = False</em><span class="sig-paren">)</span> → rl_coach.utils.Signal<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.register_signal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.register_signal" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Register a signal such that its statistics will be dumped and be viewable through dashboard</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>signal_name</strong> – the name of the signal as it will appear in dashboard</li>
|
||||
<li><strong>dump_one_value_per_episode</strong> – should the signal value be written for each episode?</li>
|
||||
<li><strong>dump_one_value_per_step</strong> – should the signal value be written for each step?</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>signal_name</strong> – the name of the signal as it will appear in dashboard</p></li>
|
||||
<li><p><strong>dump_one_value_per_episode</strong> – should the signal value be written for each episode?</p></li>
|
||||
<li><p><strong>dump_one_value_per_step</strong> – should the signal value be written for each step?</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the created signal</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the created signal</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -574,46 +523,39 @@ the observation relevant for the network from the states.</li>
|
||||
<dd><p>Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
|
||||
evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
|
||||
by val, and by the current phase set in self.phase.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> – The new phase to change to</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>val</strong> – The new phase to change to</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.reset_internal_state">
|
||||
<code class="descname">reset_internal_state</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.reset_internal_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.reset_internal_state" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Reset all the episodic parameters. This function is called right before each episode starts.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.restore_checkpoint">
|
||||
<code class="descname">restore_checkpoint</code><span class="sig-paren">(</span><em>checkpoint_dir: str</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.restore_checkpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.restore_checkpoint" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Allows agents to restore additional information when restoring checkpoints.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>checkpoint_dir</strong> – The checkpoint dir to restore from</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>checkpoint_dir</strong> – The checkpoint dir to restore from</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -621,51 +563,42 @@ by val, and by the current phase set in self.phase.</p>
|
||||
<code class="descname">run_off_policy_evaluation</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="headerlink" href="#rl_coach.agents.agent.Agent.run_off_policy_evaluation" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Run off-policy evaluation estimators to evaluate the trained policy performance against a dataset.
|
||||
Should only be implemented for off-policy RL algorithms.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.run_pre_network_filter_for_inference">
|
||||
<code class="descname">run_pre_network_filter_for_inference</code><span class="sig-paren">(</span><em>state: Dict[str, numpy.ndarray], update_filter_internal_state: bool = True</em><span class="sig-paren">)</span> → Dict[str, numpy.ndarray]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.run_pre_network_filter_for_inference"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.run_pre_network_filter_for_inference" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Run filters which were defined to be applied right before using the state for inference.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>state</strong> – The state to run the filters on</li>
|
||||
<li><strong>update_filter_internal_state</strong> – Should update the filter’s internal state - should not update when evaluating</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>state</strong> – The state to run the filters on</p></li>
|
||||
<li><p><strong>update_filter_internal_state</strong> – Should update the filter’s internal state - should not update when evaluating</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">The filtered state</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>The filtered state</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.save_checkpoint">
|
||||
<code class="descname">save_checkpoint</code><span class="sig-paren">(</span><em>checkpoint_prefix: str</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.save_checkpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.save_checkpoint" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Allows agents to store additional information when saving checkpoints.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>checkpoint_prefix</strong> – The prefix of the checkpoint file to save</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>checkpoint_prefix</strong> – The prefix of the checkpoint file to save</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -673,16 +606,14 @@ Should only be implemented for off-policy RL algorithms.</p>
|
||||
<code class="descname">set_environment_parameters</code><span class="sig-paren">(</span><em>spaces: rl_coach.spaces.SpacesDefinition</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.set_environment_parameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.set_environment_parameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
|
||||
dependent on those values, by calling init_environment_dependent_modules</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>spaces</strong> – the environment spaces definition</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>spaces</strong> – the environment spaces definition</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -692,58 +623,47 @@ dependent on those values, by calling init_environment_dependent_modules</p>
|
||||
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
|
||||
slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
|
||||
in-action-space.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> – The action that should be set as the directive</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action</strong> – The action that should be set as the directive</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.set_session">
|
||||
<code class="descname">set_session</code><span class="sig-paren">(</span><em>sess</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.set_session"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.set_session" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Set the deep learning framework session for all the agents in the composite agent</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.setup_logger">
|
||||
<code class="descname">setup_logger</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.setup_logger"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.setup_logger" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Setup the logger for the agent</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.sync">
|
||||
<code class="descname">sync</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.sync"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.sync" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sync the global network parameters to local networks</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -752,14 +672,11 @@ in-action-space.</p>
|
||||
<dd><p>Check if a training phase should be done as configured by num_consecutive_playing_steps.
|
||||
If it should, then do several training steps as configured by num_consecutive_training_steps.
|
||||
A single training iteration: Sample a batch, train on it and update target networks.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The total training loss during the training iterations.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>The total training loss during the training iterations.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -768,28 +685,22 @@ A single training iteration: Sample a batch, train on it and update target netwo
|
||||
<dd><p>Updates the episodic log file with all the signal values from the most recent episode.
|
||||
Additional signals for logging can be set by creating a new signal using self.register_signal,
|
||||
and then updating it with some internal agent values.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.update_step_in_episode_log">
|
||||
<code class="descname">update_step_in_episode_log</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.update_step_in_episode_log"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.update_step_in_episode_log" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Updates the in-episode log file with all the signal values from the most recent step.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -797,16 +708,14 @@ and then updating it with some internal agent values.</p>
|
||||
<code class="descname">update_transition_before_adding_to_replay_buffer</code><span class="sig-paren">(</span><em>transition: rl_coach.core_types.Transition</em><span class="sig-paren">)</span> → rl_coach.core_types.Transition<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.update_transition_before_adding_to_replay_buffer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.update_transition_before_adding_to_replay_buffer" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Allows agents to update the transition just before adding it to the replay buffer.
|
||||
Can be useful for agents that want to tweak the reward, termination signal, etc.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transition</strong> – the transition to update</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the updated transition</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>transition</strong> – the transition to update</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the updated transition</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
@@ -824,7 +733,7 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
|
||||
<a href="policy_optimization/ac.html" class="btn btn-neutral float-right" title="Actor-Critic" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../../contributing/add_env.html" class="btn btn-neutral" title="Adding a New Environment" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../../contributing/add_env.html" class="btn btn-neutral float-left" title="Adding a New Environment" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -833,7 +742,7 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -850,27 +759,16 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Direct Future Prediction — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Direct Future Prediction — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Soft Actor-Critic" href="../policy_optimization/sac.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -228,13 +231,13 @@
|
||||
<div class="section" id="choosing-an-action">
|
||||
<h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>The current states (observations and measurements) and the corresponding goal vector are passed as an input to the network.
|
||||
<li><p>The current states (observations and measurements) and the corresponding goal vector are passed as an input to the network.
|
||||
The output of the network is the predicted future measurements for time-steps <span class="math notranslate nohighlight">\(t+1,t+2,t+4,t+8,t+16\)</span> and
|
||||
<span class="math notranslate nohighlight">\(t+32\)</span> for each possible action.</li>
|
||||
<li>For each action, the measurements of each predicted time-step are multiplied by the goal vector,
|
||||
and the result is a single vector of future values for each action.</li>
|
||||
<li>Then, a weighted sum of the future values of each action is calculated, and the result is a single value for each action.</li>
|
||||
<li>The action values are passed to the exploration policy to decide on the action to use.</li>
|
||||
<span class="math notranslate nohighlight">\(t+32\)</span> for each possible action.</p></li>
|
||||
<li><p>For each action, the measurements of each predicted time-step are multiplied by the goal vector,
|
||||
and the result is a single vector of future values for each action.</p></li>
|
||||
<li><p>Then, a weighted sum of the future values of each action is calculated, and the result is a single value for each action.</p></li>
|
||||
<li><p>The action values are passed to the exploration policy to decide on the action to use.</p></li>
|
||||
</ol>
|
||||
</div>
|
||||
<div class="section" id="training-the-network">
|
||||
@@ -247,39 +250,35 @@ For the actions that were not taken, the targets are the current values.</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.dfp_agent.DFPAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.dfp_agent.</code><code class="descname">DFPAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/dfp_agent.html#DFPAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.dfp_agent.DFPAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_predicted_steps_ahead</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_predicted_steps_ahead</strong> – (int)
|
||||
Number of future steps to predict measurements for. The future steps won’t be sequential, but rather jump
|
||||
in multiples of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4.
|
||||
The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]</li>
|
||||
<li><strong>goal_vector</strong> – (List[float])
|
||||
The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]</p></li>
|
||||
<li><p><strong>goal_vector</strong> – (List[float])
|
||||
The goal vector will weight each of the measurements to form an optimization goal. The vector should have
|
||||
the same length as the number of measurements, and it will be vector multiplied by the measurements.
|
||||
Positive values correspond to trying to maximize the particular measurement, and negative values
|
||||
correspond to trying to minimize the particular measurement.</li>
|
||||
<li><strong>future_measurements_weights</strong> – (List[float])
|
||||
correspond to trying to minimize the particular measurement.</p></li>
|
||||
<li><p><strong>future_measurements_weights</strong> – (List[float])
|
||||
The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
|
||||
goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
|
||||
then only the 3 last timesteps will be taken into account, according to the weights in the
|
||||
future_measurements_weights vector.</li>
|
||||
<li><strong>use_accumulated_reward_as_measurement</strong> – (bool)
|
||||
future_measurements_weights vector.</p></li>
|
||||
<li><p><strong>use_accumulated_reward_as_measurement</strong> – (bool)
|
||||
If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
|
||||
the measurements vector in the state. This can be useful in environments where the given measurements don’t
|
||||
include enough information for the particular goal the agent should achieve.</li>
|
||||
<li><strong>handling_targets_after_episode_end</strong> – (HandlingTargetsAfterEpisodeEnd)
|
||||
Dictates how to handle measurements that are outside the episode length.</li>
|
||||
<li><strong>scale_measurements_targets</strong> – (Dict[str, float])
|
||||
include enough information for the particular goal the agent should achieve.</p></li>
|
||||
<li><p><strong>handling_targets_after_episode_end</strong> – (HandlingTargetsAfterEpisodeEnd)
|
||||
Dictates how to handle measurements that are outside the episode length.</p></li>
|
||||
<li><p><strong>scale_measurements_targets</strong> – (Dict[str, float])
|
||||
Allows rescaling the values of each of the measurements available. This can be useful when the measurements
|
||||
have a different scale and you want to normalize them to the same scale.</li>
|
||||
have a different scale and you want to normalize them to the same scale.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -297,7 +296,7 @@ have a different scale and you want to normalize them to the same scale.</li>
|
||||
<a href="../value_optimization/double_dqn.html" class="btn btn-neutral float-right" title="Double DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../policy_optimization/sac.html" class="btn btn-neutral" title="Soft Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../policy_optimization/sac.html" class="btn btn-neutral float-left" title="Soft Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -306,7 +305,7 @@ have a different scale and you want to normalize them to the same scale.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -323,27 +322,16 @@ have a different scale and you want to normalize them to the same scale.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Actor-Critic — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Actor-Critic — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Agents" href="../index.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -235,41 +238,37 @@ distribution assigned with these probabilities. When testing, the action with th
|
||||
<p>A batch of <span class="math notranslate nohighlight">\(T_{max}\)</span> transitions is used, and the advantages are calculated upon it.</p>
|
||||
<p>Advantages can be calculated by either of the following methods (configured by the selected preset) -</p>
|
||||
<ol class="arabic simple">
|
||||
<li><strong>A_VALUE</strong> - Estimating advantage directly:
|
||||
<li><p><strong>A_VALUE</strong> - Estimating advantage directly:
|
||||
<span class="math notranslate nohighlight">\(A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t)\)</span>
|
||||
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch.</li>
|
||||
<li><strong>GAE</strong> - By following the <a class="reference external" href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper.</li>
|
||||
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch.</p></li>
|
||||
<li><p><strong>GAE</strong> - By following the <a class="reference external" href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper.</p></li>
|
||||
</ol>
|
||||
<p>The advantages are then used in order to accumulate gradients according to
|
||||
<span class="math notranslate nohighlight">\(L = -\mathop{\mathbb{E}} [log (\pi) \cdot A]\)</span></p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.actor_critic_agent.</code><code class="descname">ActorCriticAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/actor_critic_agent.html#ActorCriticAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
The value that will be used to rescale the policy gradient</li>
|
||||
<li><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
The value that will be used to rescale the policy gradient</p></li>
|
||||
<li><p><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
The number of episodes to wait before applying the accumulated gradients to the network.
|
||||
The training iterations only accumulate gradients without actually applying them.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
The weight that will be given to the entropy regularization which is used in order to improve exploration.</li>
|
||||
<li><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
The training iterations only accumulate gradients without actually applying them.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
The weight that will be given to the entropy regularization which is used in order to improve exploration.</p></li>
|
||||
<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
|
||||
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</li>
|
||||
<li><strong>gae_lambda</strong> – (float)
|
||||
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</p></li>
|
||||
<li><p><strong>gae_lambda</strong> – (float)
|
||||
If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage estimation
|
||||
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.</li>
|
||||
<li><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value targets for the V head will be estimated using the GAE scheme.</li>
|
||||
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.</p></li>
|
||||
<li><p><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value targets for the V head will be estimated using the GAE scheme.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -287,7 +286,7 @@ If set to True, the state value targets for the V head will be estimated using t
|
||||
<a href="acer.html" class="btn btn-neutral float-right" title="ACER" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../index.html" class="btn btn-neutral" title="Agents" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../index.html" class="btn btn-neutral float-left" title="Agents" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -296,7 +295,7 @@ If set to True, the state value targets for the V head will be estimated using t
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -313,27 +312,16 @@ If set to True, the state value targets for the V head will be estimated using t
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>ACER — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>ACER — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Actor-Critic" href="ac.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -236,11 +239,11 @@ distribution assigned with these probabilities. When testing, the action with th
|
||||
and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-policy updates from batches of <span class="math notranslate nohighlight">\(T_{max}\)</span> transitions sampled from the replay buffer.</p>
|
||||
<p>Each update performs the following procedure:</p>
|
||||
<ol class="arabic">
|
||||
<li><p class="first"><strong>Calculate state values:</strong></p>
|
||||
<li><p><strong>Calculate state values:</strong></p>
|
||||
<div class="math notranslate nohighlight">
|
||||
\[V(s_t) = \mathbb{E}_{a \sim \pi} [Q(s_t,a)]\]</div>
|
||||
</li>
|
||||
<li><p class="first"><strong>Calculate Q retrace:</strong></p>
|
||||
<li><p><strong>Calculate Q retrace:</strong></p>
|
||||
<blockquote>
|
||||
<div><div class="math notranslate nohighlight">
|
||||
\[Q^{ret}(s_t,a_t) = r_t +\gamma \bar{\rho}_{t+1}[Q^{ret}(s_{t+1},a_{t+1}) - Q(s_{t+1},a_{t+1})] + \gamma V(s_{t+1})\]</div>
|
||||
@@ -248,7 +251,7 @@ and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-p
|
||||
\[\text{where} \quad \bar{\rho}_{t} = \min{\left\{c,\rho_t\right\}},\quad \rho_t=\frac{\pi (a_t \mid s_t)}{\mu (a_t \mid s_t)}\]</div>
|
||||
</div></blockquote>
|
||||
</li>
|
||||
<li><p class="first"><strong>Accumulate gradients:</strong></p>
|
||||
<li><p><strong>Accumulate gradients:</strong></p>
|
||||
<blockquote>
|
||||
<div><p><span class="math notranslate nohighlight">\(\bullet\)</span> <strong>Policy gradients (with bias correction):</strong></p>
|
||||
<blockquote>
|
||||
@@ -263,7 +266,7 @@ and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-p
|
||||
</div></blockquote>
|
||||
</div></blockquote>
|
||||
</li>
|
||||
<li><p class="first"><strong>(Optional) Trust region update:</strong> change the policy loss gradient w.r.t network output:</p>
|
||||
<li><p><strong>(Optional) Trust region update:</strong> change the policy loss gradient w.r.t network output:</p>
|
||||
<blockquote>
|
||||
<div><div class="math notranslate nohighlight">
|
||||
\[\hat{g}_t^{trust-region} = \hat{g}_t^{policy} - \max \left\{0, \frac{k^T \hat{g}_t^{policy} - \delta}{\lVert k \rVert_2^2}\right\} k\]</div>
|
||||
@@ -277,39 +280,35 @@ The goal of the trust region update is to the difference between the updated pol
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.acer_agent.ACERAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.acer_agent.</code><code class="descname">ACERAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/acer_agent.html#ACERAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.acer_agent.ACERAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
|
||||
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</li>
|
||||
<li><strong>ratio_of_replay</strong> – (int)
|
||||
The number of off-policy training iterations in each ACER iteration.</li>
|
||||
<li><strong>num_transitions_to_start_replay</strong> – (int)
|
||||
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</p></li>
|
||||
<li><p><strong>ratio_of_replay</strong> – (int)
|
||||
The number of off-policy training iterations in each ACER iteration.</p></li>
|
||||
<li><p><strong>num_transitions_to_start_replay</strong> – (int)
|
||||
Number of environment steps until ACER starts to train off-policy from the experience replay.
|
||||
This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions in
|
||||
the experience replay to start the off-policy training.</li>
|
||||
<li><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
the experience replay to start the off-policy training.</p></li>
|
||||
<li><p><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
The rate of the exponential moving average for the average policy which is used for the trust region optimization.
|
||||
The target network in this algorithm is used as the average policy.</li>
|
||||
<li><strong>importance_weight_truncation</strong> – (float)
|
||||
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).</li>
|
||||
<li><strong>use_trust_region_optimization</strong> – (bool)
|
||||
The target network in this algorithm is used as the average policy.</p></li>
|
||||
<li><p><strong>importance_weight_truncation</strong> – (float)
|
||||
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).</p></li>
|
||||
<li><p><strong>use_trust_region_optimization</strong> – (bool)
|
||||
If set to True, the gradients of the network will be modified with a term dependent on the KL divergence between
|
||||
the average policy and the current one, to bound the change of the policy during the network update.</li>
|
||||
<li><strong>max_KL_divergence</strong> – (float)
|
||||
the average policy and the current one, to bound the change of the policy during the network update.</p></li>
|
||||
<li><p><strong>max_KL_divergence</strong> – (float)
|
||||
The upper bound parameter for the trust region optimization, use_trust_region_optimization needs to be set true
|
||||
for this parameter to have an effect.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
for this parameter to have an effect.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
An entropy regularization term can be added to the loss function in order to control exploration. This term
|
||||
is weighted using the beta value defined by beta_entropy.</li>
|
||||
is weighted using the beta value defined by beta_entropy.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -327,7 +326,7 @@ is weighted using the beta value defined by beta_entropy.</li>
|
||||
<a href="../imitation/bc.html" class="btn btn-neutral float-right" title="Behavioral Cloning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="ac.html" class="btn btn-neutral" title="Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="ac.html" class="btn btn-neutral float-left" title="Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -336,7 +335,7 @@ is weighted using the beta value defined by beta_entropy.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -353,27 +352,16 @@ is weighted using the beta value defined by beta_entropy.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Clipped Proximal Policy Optimization — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Clipped Proximal Policy Optimization — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Conditional Imitation Learning" href="../imitation/cil.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -233,17 +236,14 @@
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<p>Very similar to PPO, with several small (but very simplifying) changes:</p>
|
||||
<ol class="arabic">
|
||||
<li><p class="first">Train both the value and policy networks, simultaneously, by defining a single loss function,
|
||||
which is the sum of each of the networks’ loss functions. Then, backpropagate gradients only once from this unified loss function.</p>
|
||||
</li>
|
||||
<li><p class="first">The unified network’s optimizer is set to Adam (instead of L-BFGS for the value network as in PPO).</p>
|
||||
</li>
|
||||
<li><p class="first">Value targets are now also calculated based on the GAE advantages.
|
||||
<li><p>Train both the value and policy networks, simultaneously, by defining a single loss function,
|
||||
which is the sum of each of the networks’ loss functions. Then, backpropagate gradients only once from this unified loss function.</p></li>
|
||||
<li><p>The unified network’s optimizer is set to Adam (instead of L-BFGS for the value network as in PPO).</p></li>
|
||||
<li><p>Value targets are now also calculated based on the GAE advantages.
|
||||
In this method, the <span class="math notranslate nohighlight">\(V\)</span> values are predicted from the critic network, and then added to the GAE based advantages,
|
||||
in order to get a <span class="math notranslate nohighlight">\(Q\)</span> value for each action. Now, since our critic network is predicting a <span class="math notranslate nohighlight">\(V\)</span> value for
|
||||
each state, setting the <span class="math notranslate nohighlight">\(Q\)</span> calculated action-values as a target, will on average serve as a <span class="math notranslate nohighlight">\(V\)</span> state-value target.</p>
|
||||
</li>
|
||||
<li><p class="first">Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio
|
||||
each state, setting the <span class="math notranslate nohighlight">\(Q\)</span> calculated action-values as a target, will on average serve as a <span class="math notranslate nohighlight">\(V\)</span> state-value target.</p></li>
|
||||
<li><p>Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio
|
||||
<span class="math notranslate nohighlight">\(r_t(\theta) =\frac{\pi_{\theta}(a|s)}{\pi_{\theta_{old}}(a|s)}\)</span> is clipped, to achieve a similar effect.
|
||||
This is done by defining the policy’s loss function to be the minimum between the standard surrogate loss and an epsilon
|
||||
clipped surrogate loss:</p>
|
||||
@@ -253,46 +253,42 @@ clipped surrogate loss:</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.clipped_ppo_agent.</code><code class="descname">ClippedPPOAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/clipped_ppo_agent.html#ClippedPPOAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
This represents how the critic will be used to update the actor. The critic value function is typically used
|
||||
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
|
||||
advantage of the action, or the generalized advantage estimation (GAE) value.</li>
|
||||
<li><strong>gae_lambda</strong> – (float)
|
||||
advantage of the action, or the generalized advantage estimation (GAE) value.</p></li>
|
||||
<li><p><strong>gae_lambda</strong> – (float)
|
||||
The <span class="math notranslate nohighlight">\(\lambda\)</span> value is used within the GAE function in order to weight different bootstrap length
|
||||
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
|
||||
n-step estimations.</li>
|
||||
<li><strong>clip_likelihood_ratio_using_epsilon</strong> – (float)
|
||||
n-step estimations.</p></li>
|
||||
<li><p><strong>clip_likelihood_ratio_using_epsilon</strong> – (float)
|
||||
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
|
||||
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
|
||||
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
|
||||
implementations.</li>
|
||||
<li><strong>value_targets_mix_fraction</strong> – (float)
|
||||
implementations.</p></li>
|
||||
<li><p><strong>value_targets_mix_fraction</strong> – (float)
|
||||
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
|
||||
define how much of the new targets will be taken into account when calculating the loss.
|
||||
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</li>
|
||||
<li><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value will be estimated using the GAE technique.</li>
|
||||
<li><strong>use_kl_regularization</strong> – (bool)
|
||||
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</p></li>
|
||||
<li><p><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value will be estimated using the GAE technique.</p></li>
|
||||
<li><p><strong>use_kl_regularization</strong> – (bool)
|
||||
If set to True, the loss function will be regularized using the KL divergence between the current and new
|
||||
policy, to bound the change of the policy during the network update.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
policy, to bound the change of the policy during the network update.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
An entropy regularization term can be added to the loss function in order to control exploration. This term
|
||||
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</li>
|
||||
<li><strong>optimization_epochs</strong> – (int)
|
||||
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</p></li>
|
||||
<li><p><strong>optimization_epochs</strong> – (int)
|
||||
For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
|
||||
optimization_epochs value.</li>
|
||||
<li><strong>clipping_decay_schedule</strong> – (Schedule)
|
||||
Can be used to define a schedule over the clipping of the likelihood ratio.</li>
|
||||
optimization_epochs value.</p></li>
|
||||
<li><p><strong>clipping_decay_schedule</strong> – (Schedule)
|
||||
Can be used to define a schedule over the clipping of the likelihood ratio.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -310,7 +306,7 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
|
||||
<a href="ddpg.html" class="btn btn-neutral float-right" title="Deep Deterministic Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../imitation/cil.html" class="btn btn-neutral" title="Conditional Imitation Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../imitation/cil.html" class="btn btn-neutral float-left" title="Conditional Imitation Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -319,7 +315,7 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -336,27 +332,16 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Deep Deterministic Policy Gradient — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Deep Deterministic Policy Gradient — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Clipped Proximal Policy Optimization" href="cppo.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -235,14 +238,14 @@ to add exploration noise to the action. When testing, use the mean vector <span
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<p>Start by sampling a batch of transitions from the experience replay.</p>
|
||||
<ul>
|
||||
<li><p class="first">To train the <strong>critic network</strong>, use the following targets:</p>
|
||||
<li><p>To train the <strong>critic network</strong>, use the following targets:</p>
|
||||
<p><span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} ))\)</span></p>
|
||||
<p>First run the actor target network, using the next states as the inputs, and get <span class="math notranslate nohighlight">\(\mu (s_{t+1} )\)</span>.
|
||||
Next, run the critic target network using the next states and <span class="math notranslate nohighlight">\(\mu (s_{t+1} )\)</span>, and use the output to
|
||||
calculate <span class="math notranslate nohighlight">\(y_t\)</span> according to the equation above. To train the network, use the current states and actions
|
||||
as the inputs, and <span class="math notranslate nohighlight">\(y_t\)</span> as the targets.</p>
|
||||
</li>
|
||||
<li><p class="first">To train the <strong>actor network</strong>, use the following equation:</p>
|
||||
<li><p>To train the <strong>actor network</strong>, use the following equation:</p>
|
||||
<p><span class="math notranslate nohighlight">\(\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]\)</span></p>
|
||||
<p>Use the actor’s online network to get the action mean values using the current states as the inputs.
|
||||
Then, use the critic online network in order to get the gradients of the critic output with respect to the
|
||||
@@ -255,35 +258,31 @@ given <span class="math notranslate nohighlight">\(\nabla_a Q(s,a)\)</span>. Fin
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.ddpg_agent.</code><code class="descname">DDPGAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/ddpg_agent.html#DDPGAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</li>
|
||||
<li><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</p></li>
|
||||
<li><p><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
When copying the online network weights to the target network weights, a soft update will be used, which
|
||||
weight the new online network weights by rate_for_copying_weights_to_target</li>
|
||||
<li><strong>num_consecutive_playing_steps</strong> – (StepMethod)
|
||||
The number of consecutive steps to act between every two training iterations</li>
|
||||
<li><strong>use_target_network_for_evaluation</strong> – (bool)
|
||||
weight the new online network weights by rate_for_copying_weights_to_target</p></li>
|
||||
<li><p><strong>num_consecutive_playing_steps</strong> – (StepMethod)
|
||||
The number of consecutive steps to act between every two training iterations</p></li>
|
||||
<li><p><strong>use_target_network_for_evaluation</strong> – (bool)
|
||||
If set to True, the target network will be used for predicting the actions when choosing actions to act.
|
||||
Since the target network weights change more slowly, the predicted actions will be more consistent.</li>
|
||||
<li><strong>action_penalty</strong> – (float)
|
||||
Since the target network weights change more slowly, the predicted actions will be more consistent.</p></li>
|
||||
<li><p><strong>action_penalty</strong> – (float)
|
||||
The amount by which to penalize the network on high action feature (pre-activation) values.
|
||||
This can prevent the actions features from saturating the TanH activation function, and therefore prevent the
|
||||
gradients from becoming very low.</li>
|
||||
<li><strong>clip_critic_targets</strong> – (Tuple[float, float] or None)
|
||||
The range to clip the critic target to in order to prevent overestimation of the action values.</li>
|
||||
<li><strong>use_non_zero_discount_for_terminal_states</strong> – (bool)
|
||||
gradients from becoming very low.</p></li>
|
||||
<li><p><strong>clip_critic_targets</strong> – (Tuple[float, float] or None)
|
||||
The range to clip the critic target to in order to prevent overestimation of the action values.</p></li>
|
||||
<li><p><strong>use_non_zero_discount_for_terminal_states</strong> – (bool)
|
||||
If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
|
||||
values. If set to False, the terminal states reward will be taken as the target return for the network.</li>
|
||||
values. If set to False, the terminal states reward will be taken as the target return for the network.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -301,7 +300,7 @@ values. If set to False, the terminal states reward will be taken as the target
|
||||
<a href="sac.html" class="btn btn-neutral float-right" title="Soft Actor-Critic" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="cppo.html" class="btn btn-neutral" title="Clipped Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="cppo.html" class="btn btn-neutral float-left" title="Clipped Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -310,7 +309,7 @@ values. If set to False, the terminal states reward will be taken as the target
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -327,27 +326,16 @@ values. If set to False, the terminal states reward will be taken as the target
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Hierarchical Actor Critic — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Hierarchical Actor Critic — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -31,21 +39,16 @@
|
||||
<link rel="search" title="Search" href="../../../search.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -212,7 +215,7 @@ to add exploration noise to the action. When testing, use the mean vector <span
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -229,27 +232,16 @@ to add exploration noise to the action. When testing, use the mean vector <span
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Policy Gradient — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Policy Gradient — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Persistent Advantage Learning" href="../value_optimization/pal.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -237,11 +240,11 @@ The <code class="code docutils literal notranslate"><span class="pre">PolicyGrad
|
||||
This is done in order to reduce the variance of the updates, since noisy gradient updates might destabilize the policy’s
|
||||
convergence. The rescaler is a configurable parameter and there are few options to choose from:</p>
|
||||
<ul class="simple">
|
||||
<li><strong>Total Episode Return</strong> - The sum of all the discounted rewards during the episode.</li>
|
||||
<li><strong>Future Return</strong> - Return from each transition until the end of the episode.</li>
|
||||
<li><strong>Future Return Normalized by Episode</strong> - Future returns across the episode normalized by the episode’s mean and standard deviation.</li>
|
||||
<li><strong>Future Return Normalized by Timestep</strong> - Future returns normalized using running means and standard deviations,
|
||||
which are calculated separately for each timestep, across different episodes.</li>
|
||||
<li><p><strong>Total Episode Return</strong> - The sum of all the discounted rewards during the episode.</p></li>
|
||||
<li><p><strong>Future Return</strong> - Return from each transition until the end of the episode.</p></li>
|
||||
<li><p><strong>Future Return Normalized by Episode</strong> - Future returns across the episode normalized by the episode’s mean and standard deviation.</p></li>
|
||||
<li><p><strong>Future Return Normalized by Timestep</strong> - Future returns normalized using running means and standard deviations,
|
||||
which are calculated separately for each timestep, across different episodes.</p></li>
|
||||
</ul>
|
||||
<p>Gradients are accumulated over a number of full played episodes. The gradients accumulation over several episodes
|
||||
serves the same purpose - reducing the update variance. After accumulating gradients for several episodes,
|
||||
@@ -249,32 +252,28 @@ the gradients are then applied to the network.</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.policy_gradients_agent.PolicyGradientAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.policy_gradients_agent.</code><code class="descname">PolicyGradientAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/policy_gradients_agent.html#PolicyGradientAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.policy_gradients_agent.PolicyGradientAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
The rescaler type to use for the policy gradient loss. For policy gradients, we calculate log probability of
|
||||
the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
|
||||
return, but there are other rescalers that are intended for reducing the variance of the updates.</li>
|
||||
<li><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
return, but there are other rescalers that are intended for reducing the variance of the updates.</p></li>
|
||||
<li><p><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
The number of episodes between applying the accumulated gradients to the network. After every
|
||||
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
|
||||
it will then accumulate it in internal accumulators, and will only apply them to the network once in every
|
||||
apply_gradients_every_x_episodes episodes.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
apply_gradients_every_x_episodes episodes.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
|
||||
will be added to the loss and scaled by the given beta factor.</li>
|
||||
<li><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
will be added to the loss and scaled by the given beta factor.</p></li>
|
||||
<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
|
||||
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
|
||||
are used in the batch.</li>
|
||||
are used in the batch.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -292,7 +291,7 @@ are used in the batch.</li>
|
||||
<a href="ppo.html" class="btn btn-neutral float-right" title="Proximal Policy Optimization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../value_optimization/pal.html" class="btn btn-neutral" title="Persistent Advantage Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../value_optimization/pal.html" class="btn btn-neutral float-left" title="Persistent Advantage Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -301,7 +300,7 @@ are used in the batch.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -318,27 +317,16 @@ are used in the batch.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Proximal Policy Optimization — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Proximal Policy Optimization — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Policy Gradient" href="pg.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -234,66 +237,62 @@ When testing, just take the mean values predicted by the network.</p>
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Collect a big chunk of experience (in the order of thousands of transitions, sampled from multiple episodes).</li>
|
||||
<li>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman ‘2015).</li>
|
||||
<li>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers,
|
||||
<li><p>Collect a big chunk of experience (in the order of thousands of transitions, sampled from multiple episodes).</p></li>
|
||||
<li><p>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman ‘2015).</p></li>
|
||||
<li><p>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers,
|
||||
the L-BFGS optimizer runs on the entire dataset at once, without batching.
|
||||
It continues running until some low loss threshold is reached. To prevent overfitting to the current dataset,
|
||||
the value targets are updated in a soft manner, using an Exponentially Weighted Moving Average, based on the total
|
||||
discounted returns of each state in each episode.</li>
|
||||
<li>Run several training iterations of the policy network. This is done by using the previously calculated advantages as
|
||||
discounted returns of each state in each episode.</p></li>
|
||||
<li><p>Run several training iterations of the policy network. This is done by using the previously calculated advantages as
|
||||
targets. The loss function penalizes policies that deviate too far from the old policy (the policy that was used <em>before</em>
|
||||
starting to run the current set of training iterations) using a regularization term.</li>
|
||||
<li>After training is done, the last sampled KL divergence value will be compared with the <em>target KL divergence</em> value,
|
||||
starting to run the current set of training iterations) using a regularization term.</p></li>
|
||||
<li><p>After training is done, the last sampled KL divergence value will be compared with the <em>target KL divergence</em> value,
|
||||
in order to adapt the penalty coefficient used in the policy loss. If the KL divergence went too high,
|
||||
increase the penalty, if it went too low, reduce it. Otherwise, leave it unchanged.</li>
|
||||
increase the penalty, if it went too low, reduce it. Otherwise, leave it unchanged.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.ppo_agent.PPOAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.ppo_agent.</code><code class="descname">PPOAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/ppo_agent.html#PPOAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.ppo_agent.PPOAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
This represents how the critic will be used to update the actor. The critic value function is typically used
|
||||
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
|
||||
advantage of the action, or the generalized advantage estimation (GAE) value.</li>
|
||||
<li><strong>gae_lambda</strong> – (float)
|
||||
advantage of the action, or the generalized advantage estimation (GAE) value.</p></li>
|
||||
<li><p><strong>gae_lambda</strong> – (float)
|
||||
The <span class="math notranslate nohighlight">\(\lambda\)</span> value is used within the GAE function in order to weight different bootstrap length
|
||||
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
|
||||
n-step estimations.</li>
|
||||
<li><strong>target_kl_divergence</strong> – (float)
|
||||
n-step estimations.</p></li>
|
||||
<li><p><strong>target_kl_divergence</strong> – (float)
|
||||
The target kl divergence between the current policy distribution and the new policy. PPO uses a heuristic to
|
||||
bring the KL divergence to this value, by adding a penalty if the kl divergence is higher.</li>
|
||||
<li><strong>initial_kl_coefficient</strong> – (float)
|
||||
bring the KL divergence to this value, by adding a penalty if the kl divergence is higher.</p></li>
|
||||
<li><p><strong>initial_kl_coefficient</strong> – (float)
|
||||
The initial weight that will be given to the KL divergence between the current and the new policy in the
|
||||
regularization factor.</li>
|
||||
<li><strong>high_kl_penalty_coefficient</strong> – (float)
|
||||
The penalty that will be given for KL divergence values which are higher than what was defined as the target.</li>
|
||||
<li><strong>clip_likelihood_ratio_using_epsilon</strong> – (float)
|
||||
regularization factor.</p></li>
|
||||
<li><p><strong>high_kl_penalty_coefficient</strong> – (float)
|
||||
The penalty that will be given for KL divergence values which are higher than what was defined as the target.</p></li>
|
||||
<li><p><strong>clip_likelihood_ratio_using_epsilon</strong> – (float)
|
||||
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
|
||||
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
|
||||
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
|
||||
implementations.</li>
|
||||
<li><strong>value_targets_mix_fraction</strong> – (float)
|
||||
implementations.</p></li>
|
||||
<li><p><strong>value_targets_mix_fraction</strong> – (float)
|
||||
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
|
||||
define how much of the new targets will be taken into account when calculating the loss.
|
||||
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</li>
|
||||
<li><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value will be estimated using the GAE technique.</li>
|
||||
<li><strong>use_kl_regularization</strong> – (bool)
|
||||
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</p></li>
|
||||
<li><p><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value will be estimated using the GAE technique.</p></li>
|
||||
<li><p><strong>use_kl_regularization</strong> – (bool)
|
||||
If set to True, the loss function will be regularized using the KL divergence between the current and new
|
||||
policy, to bound the change of the policy during the network update.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
policy, to bound the change of the policy during the network update.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
An entropy regularization term can be added to the loss function in order to control exploration. This term
|
||||
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</li>
|
||||
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -311,7 +310,7 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
|
||||
<a href="../value_optimization/rainbow.html" class="btn btn-neutral float-right" title="Rainbow" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="pg.html" class="btn btn-neutral" title="Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="pg.html" class="btn btn-neutral float-left" title="Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -320,7 +319,7 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -337,27 +336,16 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Soft Actor-Critic — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Soft Actor-Critic — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Deep Deterministic Policy Gradient" href="ddpg.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -235,19 +238,19 @@ by picking the mean value or sample from a gaussian distribution like in trainin
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<p>Start by sampling a batch <span class="math notranslate nohighlight">\(B\)</span> of transitions from the experience replay.</p>
|
||||
<ul>
|
||||
<li><p class="first">To train the <strong>Q network</strong>, use the following targets:</p>
|
||||
<li><p>To train the <strong>Q network</strong>, use the following targets:</p>
|
||||
<div class="math notranslate nohighlight">
|
||||
\[y_t^Q=r(s_t,a_t)+\gamma \cdot V(s_{t+1})\]</div>
|
||||
<p>The state value used in the above target is acquired by running the target state value network.</p>
|
||||
</li>
|
||||
<li><p class="first">To train the <strong>State Value network</strong>, use the following targets:</p>
|
||||
<li><p>To train the <strong>State Value network</strong>, use the following targets:</p>
|
||||
<div class="math notranslate nohighlight">
|
||||
\[y_t^V = \min_{i=1,2}Q_i(s_t,\tilde{a}) - \log\pi (\tilde{a} \vert s_t),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)\]</div>
|
||||
<p>The state value network is trained using a sample-based approximation of the connection between the state value and the state
|
||||
action values. The actions used for constructing the target are <strong>not</strong> sampled from the replay buffer, but rather sampled
|
||||
from the current policy.</p>
|
||||
</li>
|
||||
<li><p class="first">To train the <strong>actor network</strong>, use the following equation:</p>
|
||||
<li><p>To train the <strong>actor network</strong>, use the following equation:</p>
|
||||
<div class="math notranslate nohighlight">
|
||||
\[\nabla_{\theta} J \approx \nabla_{\theta} \frac{1}{\vert B \vert} \sum_{s_t\in B} \left( Q \left(s_t, \tilde{a}_\theta(s_t)\right) - \log\pi_{\theta}(\tilde{a}_{\theta}(s_t)\vert s_t) \right),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)\]</div>
|
||||
</li>
|
||||
@@ -256,24 +259,20 @@ from the current policy.</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.soft_actor_critic_agent.</code><code class="descname">SoftActorCriticAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/soft_actor_critic_agent.html#SoftActorCriticAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</li>
|
||||
<li><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</p></li>
|
||||
<li><p><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
When copying the online network weights to the target network weights, a soft update will be used, which
|
||||
weights the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)</li>
|
||||
<li><strong>use_deterministic_for_evaluation</strong> – (bool)
|
||||
weights the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)</p></li>
|
||||
<li><p><strong>use_deterministic_for_evaluation</strong> – (bool)
|
||||
If True, during the evaluation phase, actions are chosen deterministically according to the policy mean
|
||||
and not sampled from the policy distribution.</li>
|
||||
and not sampled from the policy distribution.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -291,7 +290,7 @@ and not sampled from the policy distribution.</li>
|
||||
<a href="../other/dfp.html" class="btn btn-neutral float-right" title="Direct Future Prediction" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="ddpg.html" class="btn btn-neutral" title="Deep Deterministic Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="ddpg.html" class="btn btn-neutral float-left" title="Deep Deterministic Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -300,7 +299,7 @@ and not sampled from the policy distribution.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -317,27 +316,16 @@ and not sampled from the policy distribution.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Bootstrapped DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Bootstrapped DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Behavioral Cloning" href="../imitation/bc.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -265,7 +268,7 @@ Then, train the online network according to the calculated targets.</p>
|
||||
<a href="categorical_dqn.html" class="btn btn-neutral float-right" title="Categorical DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../imitation/bc.html" class="btn btn-neutral" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../imitation/bc.html" class="btn btn-neutral float-left" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -274,7 +277,7 @@ Then, train the online network according to the calculated targets.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -291,27 +294,16 @@ Then, train the online network according to the calculated targets.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Categorical DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Categorical DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Bootstrapped DQN" href="bs_dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -227,43 +230,36 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic">
|
||||
<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
|
||||
</li>
|
||||
<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
|
||||
that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
|
||||
<p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
|
||||
<p>where:
|
||||
* <span class="math notranslate nohighlight">\([ \cdot ]\)</span> bounds its argument in the range <span class="math notranslate nohighlight">\([a, b]\)</span>
|
||||
* <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom <span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r+\gamma z_j\)</span></p>
|
||||
</li>
|
||||
<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
|
||||
probability distribution. Only the target of the actions that were actually taken is updated.</p>
|
||||
</li>
|
||||
<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
|
||||
</li>
|
||||
<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
|
||||
probability distribution. Only the target of the actions that were actually taken is updated.</p></li>
|
||||
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.categorical_dqn_agent.</code><code class="descname">CategoricalDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/categorical_dqn_agent.html#CategoricalDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>v_min</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>v_min</strong> – (float)
|
||||
The minimal value that will be represented in the network output for predicting the Q value.
|
||||
Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</li>
|
||||
<li><strong>v_max</strong> – (float)
|
||||
Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</p></li>
|
||||
<li><p><strong>v_max</strong> – (float)
|
||||
The maximum value that will be represented in the network output for predicting the Q value.
|
||||
Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</li>
|
||||
<li><strong>atoms</strong> – (int)
|
||||
Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</p></li>
|
||||
<li><p><strong>atoms</strong> – (int)
|
||||
The number of atoms that will be used to discretize the range between v_min and v_max.
|
||||
For the C51 algorithm described in the paper, the number of atoms is 51.</li>
|
||||
For the C51 algorithm described in the paper, the number of atoms is 51.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -281,7 +277,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
|
||||
<a href="../imitation/cil.html" class="btn btn-neutral float-right" title="Conditional Imitation Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="bs_dqn.html" class="btn btn-neutral" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="bs_dqn.html" class="btn btn-neutral float-left" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -290,7 +286,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -307,27 +303,16 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Double DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Double DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Direct Future Prediction" href="../other/dfp.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -227,17 +230,17 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
|
||||
action <span class="math notranslate nohighlight">\(argmax_a Q(s_{t+1},a)\)</span>. For these actions, use the corresponding next states and run the target
|
||||
network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</li>
|
||||
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
|
||||
network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</p></li>
|
||||
<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
|
||||
use the current states from the sampled batch, and run the online network to get the current Q values predictions.
|
||||
Set those values as the targets for the actions that were not actually played.</li>
|
||||
<li>For each action that was played, use the following equation for calculating the targets of the network:
|
||||
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
|
||||
<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
|
||||
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
|
||||
Set those values as the targets for the actions that were not actually played.</p></li>
|
||||
<li><p>For each action that was played, use the following equation for calculating the targets of the network:
|
||||
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
|
||||
<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
|
||||
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
|
||||
</ol>
|
||||
</div>
|
||||
</div>
|
||||
@@ -254,7 +257,7 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
<a href="dqn.html" class="btn btn-neutral float-right" title="Deep Q Networks" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../other/dfp.html" class="btn btn-neutral" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../other/dfp.html" class="btn btn-neutral float-left" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -263,7 +266,7 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -280,27 +283,16 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Deep Q Networks — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Deep Q Networks — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Double DQN" href="double_dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -227,16 +230,16 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
|
||||
the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</li>
|
||||
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
|
||||
the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</p></li>
|
||||
<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
|
||||
use the current states from the sampled batch, and run the online network to get the current Q values predictions.
|
||||
Set those values as the targets for the actions that were not actually played.</li>
|
||||
<li>For each action that was played, use the following equation for calculating the targets of the network:
|
||||
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot max_a Q(s_{t+1})\)</span></li>
|
||||
<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
|
||||
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
|
||||
Set those values as the targets for the actions that were not actually played.</p></li>
|
||||
<li><p>For each action that was played, use the following equation for calculating the targets of the network:
|
||||
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot max_a Q(s_{t+1})\)</span></p></li>
|
||||
<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
|
||||
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.dqn_agent.DQNAlgorithmParameters">
|
||||
@@ -258,7 +261,7 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
<a href="dueling_dqn.html" class="btn btn-neutral float-right" title="Dueling DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="double_dqn.html" class="btn btn-neutral" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="double_dqn.html" class="btn btn-neutral float-left" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -267,7 +270,7 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -284,27 +287,16 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Dueling DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Dueling DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Deep Q Networks" href="dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -245,7 +248,7 @@ single action has been taken at this state.</p>
|
||||
<a href="mmc.html" class="btn btn-neutral float-right" title="Mixed Monte Carlo" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="dqn.html" class="btn btn-neutral" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="dqn.html" class="btn btn-neutral float-left" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -254,7 +257,7 @@ single action has been taken at this state.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -271,27 +274,16 @@ single action has been taken at this state.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Mixed Monte Carlo — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Mixed Monte Carlo — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Dueling DQN" href="dueling_dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -238,16 +241,13 @@ Once in every few thousand steps, copy the weights from the online network to th
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.mmc_agent.</code><code class="descname">MixedMonteCarloAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/mmc_agent.html#MixedMonteCarloAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>monte_carlo_mixing_rate</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>monte_carlo_mixing_rate</strong> – (float)
|
||||
The mixing rate is used for setting the amount of monte carlo estimate (full return) that will be mixes into
|
||||
the single-step bootstrapped targets.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
the single-step bootstrapped targets.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -265,7 +265,7 @@ the single-step bootstrapped targets.</td>
|
||||
<a href="n_step.html" class="btn btn-neutral float-right" title="N-Step Q Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="dueling_dqn.html" class="btn btn-neutral" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="dueling_dqn.html" class="btn btn-neutral float-left" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -274,7 +274,7 @@ the single-step bootstrapped targets.</td>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -291,27 +291,16 @@ the single-step bootstrapped targets.</td>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>N-Step Q Learning — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>N-Step Q Learning — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Mixed Monte Carlo" href="mmc.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -228,43 +231,39 @@
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<p>The <span class="math notranslate nohighlight">\(N\)</span>-step Q learning algorithm works in a similar manner to DQN except for the following changes:</p>
|
||||
<ol class="arabic simple">
|
||||
<li>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
|
||||
<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</li>
|
||||
<li>In order to stabilize the learning, multiple workers work together to update the network.
|
||||
This creates the same effect as uncorrelating the samples used for training.</li>
|
||||
<li>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
|
||||
<li><p>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
|
||||
<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</p></li>
|
||||
<li><p>In order to stabilize the learning, multiple workers work together to update the network.
|
||||
This creates the same effect as uncorrelating the samples used for training.</p></li>
|
||||
<li><p>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
|
||||
to form the <span class="math notranslate nohighlight">\(N\)</span>-step Q targets, according to the following equation:
|
||||
<span class="math notranslate nohighlight">\(R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})\)</span>
|
||||
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</li>
|
||||
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.n_step_q_agent.</code><code class="descname">NStepQAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/n_step_q_agent.html#NStepQAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</li>
|
||||
<li><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</p></li>
|
||||
<li><p><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
The number of episodes between applying the accumulated gradients to the network. After every
|
||||
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
|
||||
it will then accumulate it in internal accumulators, and will only apply them to the network once in every
|
||||
apply_gradients_every_x_episodes episodes.</li>
|
||||
<li><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
apply_gradients_every_x_episodes episodes.</p></li>
|
||||
<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
|
||||
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
|
||||
are used in the batch.</li>
|
||||
<li><strong>targets_horizon</strong> – (str)
|
||||
are used in the batch.</p></li>
|
||||
<li><p><strong>targets_horizon</strong> – (str)
|
||||
Should be either ‘N-Step’ or ‘1-Step’, and defines the length over which to bootstrap the network values.
|
||||
Essentially, 1-Step follows the regular 1 step bootstrapping Q learning update. For more information,
|
||||
please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</li>
|
||||
please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -282,7 +281,7 @@ please refer to the original paper (<a class="reference external" href="https://
|
||||
<a href="naf.html" class="btn btn-neutral float-right" title="Normalized Advantage Functions" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="mmc.html" class="btn btn-neutral" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="mmc.html" class="btn btn-neutral float-left" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -291,7 +290,7 @@ please refer to the original paper (<a class="reference external" href="https://
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -308,27 +307,16 @@ please refer to the original paper (<a class="reference external" href="https://
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Normalized Advantage Functions — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Normalized Advantage Functions — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="N-Step Q Learning" href="n_step.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -258,7 +261,7 @@ After every training step, use a soft update in order to copy the weights from t
|
||||
<a href="nec.html" class="btn btn-neutral float-right" title="Neural Episodic Control" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="n_step.html" class="btn btn-neutral" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="n_step.html" class="btn btn-neutral float-left" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -267,7 +270,7 @@ After every training step, use a soft update in order to copy the weights from t
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -284,27 +287,16 @@ After every training step, use a soft update in order to copy the weights from t
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Neural Episodic Control — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Neural Episodic Control — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Normalized Advantage Functions" href="naf.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -229,14 +232,14 @@
|
||||
<div class="section" id="choosing-an-action">
|
||||
<h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
|
||||
output from the middleware.</li>
|
||||
<li>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
|
||||
<li><p>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
|
||||
output from the middleware.</p></li>
|
||||
<li><p>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
|
||||
The DND is queried and returns the <span class="math notranslate nohighlight">\(P\)</span> nearest neighbor keys and values. The keys and values are used to calculate
|
||||
and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</li>
|
||||
<li>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</li>
|
||||
<li>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
|
||||
accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</li>
|
||||
and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</p></li>
|
||||
<li><p>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</p></li>
|
||||
<li><p>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
|
||||
accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</p></li>
|
||||
</ol>
|
||||
</div>
|
||||
<div class="section" id="finalizing-an-episode">
|
||||
@@ -256,40 +259,36 @@ the network if necessary:
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.nec_agent.NECAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.nec_agent.</code><code class="descname">NECAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/nec_agent.html#NECAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.nec_agent.NECAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>dnd_size</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dnd_size</strong> – (int)
|
||||
Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
|
||||
of transitions that will be stored is dnd_size x num_actions.</li>
|
||||
<li><strong>l2_norm_added_delta</strong> – (float)
|
||||
of transitions that will be stored is dnd_size x num_actions.</p></li>
|
||||
<li><p><strong>l2_norm_added_delta</strong> – (float)
|
||||
A small value that will be added when calculating the weight of each of the DND entries. This follows the
|
||||
<span class="math notranslate nohighlight">\(\delta\)</span> parameter defined in the paper.</li>
|
||||
<li><strong>new_value_shift_coefficient</strong> – (float)
|
||||
<span class="math notranslate nohighlight">\(\delta\)</span> parameter defined in the paper.</p></li>
|
||||
<li><p><strong>new_value_shift_coefficient</strong> – (float)
|
||||
In the case where a new embedding that was added to the DND was already present, the value that will be stored
|
||||
in the DND is a mix between the existing value and the new value. The mix rate is defined by
|
||||
new_value_shift_coefficient.</li>
|
||||
<li><strong>number_of_knn</strong> – (int)
|
||||
The number of neighbors that will be retrieved for each DND query.</li>
|
||||
<li><strong>DND_key_error_threshold</strong> – (float)
|
||||
new_value_shift_coefficient.</p></li>
|
||||
<li><p><strong>number_of_knn</strong> – (int)
|
||||
The number of neighbors that will be retrieved for each DND query.</p></li>
|
||||
<li><p><strong>DND_key_error_threshold</strong> – (float)
|
||||
When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
|
||||
exists in the DND, since exact matches of embeddings are very rare.</li>
|
||||
<li><strong>propagate_updates_to_DND</strong> – (bool)
|
||||
exists in the DND, since exact matches of embeddings are very rare.</p></li>
|
||||
<li><p><strong>propagate_updates_to_DND</strong> – (bool)
|
||||
If set to True, when the gradients of the network will be calculated, the gradients will also be
|
||||
backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
|
||||
network weights.</li>
|
||||
<li><strong>n_step</strong> – (int)
|
||||
The bootstrap length that will be used when calculating the state values to store in the DND.</li>
|
||||
<li><strong>bootstrap_total_return_from_old_policy</strong> – (bool)
|
||||
network weights.</p></li>
|
||||
<li><p><strong>n_step</strong> – (int)
|
||||
The bootstrap length that will be used when calculating the state values to store in the DND.</p></li>
|
||||
<li><p><strong>bootstrap_total_return_from_old_policy</strong> – (bool)
|
||||
If set to True, the bootstrap that will be used to calculate each state-action value, is the network value
|
||||
when the state was first seen, and not the latest, most up-to-date network value.</li>
|
||||
when the state was first seen, and not the latest, most up-to-date network value.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -307,7 +306,7 @@ when the state was first seen, and not the latest, most up-to-date network value
|
||||
<a href="pal.html" class="btn btn-neutral float-right" title="Persistent Advantage Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="naf.html" class="btn btn-neutral" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="naf.html" class="btn btn-neutral float-left" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -316,7 +315,7 @@ when the state was first seen, and not the latest, most up-to-date network value
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -333,27 +332,16 @@ when the state was first seen, and not the latest, most up-to-date network value
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Persistent Advantage Learning — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Persistent Advantage Learning — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Neural Episodic Control" href="nec.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -227,47 +230,43 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>Start by calculating the initial target values in the same manner as they are calculated in DDQN
|
||||
<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
|
||||
<li>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>Start by calculating the initial target values in the same manner as they are calculated in DDQN
|
||||
<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
|
||||
<li><p>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
|
||||
To calculate the action gap, run the target network using the current states and get the <span class="math notranslate nohighlight">\(Q\)</span> values
|
||||
for all the actions. Then estimate <span class="math notranslate nohighlight">\(V\)</span> as the maximum predicted <span class="math notranslate nohighlight">\(Q\)</span> value for the current state:
|
||||
<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></li>
|
||||
<li>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
|
||||
<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></p></li>
|
||||
<li><p>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
|
||||
the targets <span class="math notranslate nohighlight">\(y_t^{DDQN}\)</span>:
|
||||
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></li>
|
||||
<li>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
|
||||
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></p></li>
|
||||
<li><p>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
|
||||
gap for the next state:
|
||||
<span class="math notranslate nohighlight">\(V(s_{t+1} )-Q(s_{t+1},a_{t+1})\)</span>
|
||||
where <span class="math notranslate nohighlight">\(a_{t+1}\)</span> is chosen by running the next states through the online network and choosing the action that
|
||||
has the highest predicted <span class="math notranslate nohighlight">\(Q\)</span> value. Finally, the targets will be defined as -
|
||||
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></li>
|
||||
<li>Train the online network using the current states as inputs, and with the aforementioned targets.</li>
|
||||
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
|
||||
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></p></li>
|
||||
<li><p>Train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
|
||||
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.pal_agent.PALAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.pal_agent.</code><code class="descname">PALAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/pal_agent.html#PALAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.pal_agent.PALAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>pal_alpha</strong> – (float)
|
||||
A factor that weights the amount by which the advantage learning update will be taken into account.</li>
|
||||
<li><strong>persistent_advantage_learning</strong> – (bool)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>pal_alpha</strong> – (float)
|
||||
A factor that weights the amount by which the advantage learning update will be taken into account.</p></li>
|
||||
<li><p><strong>persistent_advantage_learning</strong> – (bool)
|
||||
If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
|
||||
the same actions one after the other instead of changing actions.</li>
|
||||
<li><strong>monte_carlo_mixing_rate</strong> – (float)
|
||||
the same actions one after the other instead of changing actions.</p></li>
|
||||
<li><p><strong>monte_carlo_mixing_rate</strong> – (float)
|
||||
The amount of monte carlo values to mix into the targets of the network. The monte carlo values are just the
|
||||
total discounted returns, and they can help reduce the time it takes for the network to update to the newly
|
||||
seen values, since it is not based on bootstrapping the current network values.</li>
|
||||
seen values, since it is not based on bootstrapping the current network values.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -285,7 +284,7 @@ seen values, since it is not based on bootstrapping the current network values.<
|
||||
<a href="../policy_optimization/pg.html" class="btn btn-neutral float-right" title="Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="nec.html" class="btn btn-neutral" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="nec.html" class="btn btn-neutral float-left" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -294,7 +293,7 @@ seen values, since it is not based on bootstrapping the current network values.<
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -311,27 +310,16 @@ seen values, since it is not based on bootstrapping the current network values.<
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Quantile Regression DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Quantile Regression DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Rainbow" href="rainbow.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -227,33 +230,29 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
|
||||
by following the Bellman equation.
|
||||
Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the
|
||||
quantile midpoints targets.</li>
|
||||
<li>The network is trained with the quantile regression loss between the resulting quantile locations and the target
|
||||
quantile locations. Only the targets of the actions that were actually taken are updated.</li>
|
||||
<li>Once in every few thousand steps, weights are copied from the online network to the target network.</li>
|
||||
quantile midpoints targets.</p></li>
|
||||
<li><p>The network is trained with the quantile regression loss between the resulting quantile locations and the target
|
||||
quantile locations. Only the targets of the actions that were actually taken are updated.</p></li>
|
||||
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.qr_dqn_agent.</code><code class="descname">QuantileRegressionDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/qr_dqn_agent.html#QuantileRegressionDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>atoms</strong> – (int)
|
||||
the number of atoms to predict for each action</li>
|
||||
<li><strong>huber_loss_interval</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>atoms</strong> – (int)
|
||||
the number of atoms to predict for each action</p></li>
|
||||
<li><p><strong>huber_loss_interval</strong> – (float)
|
||||
One of the Huber loss parameters, referred to as <span class="math notranslate nohighlight">\(\kappa\)</span> in the paper.
|
||||
It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</li>
|
||||
It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -271,7 +270,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
|
||||
<a href="../../architectures/index.html" class="btn btn-neutral float-right" title="Architectures" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="rainbow.html" class="btn btn-neutral" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="rainbow.html" class="btn btn-neutral float-left" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -280,7 +279,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -297,27 +296,16 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Rainbow — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Rainbow — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Proximal Policy Optimization" href="../policy_optimization/ppo.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -226,19 +229,18 @@
|
||||
<h2>Algorithm Description<a class="headerlink" href="#algorithm-description" title="Permalink to this headline">¶</a></h2>
|
||||
<p>Rainbow combines 6 recent advancements in reinforcement learning:</p>
|
||||
<ul class="simple">
|
||||
<li>N-step returns</li>
|
||||
<li>Distributional state-action value learning</li>
|
||||
<li>Dueling networks</li>
|
||||
<li>Noisy Networks</li>
|
||||
<li>Double DQN</li>
|
||||
<li>Prioritized Experience Replay</li>
|
||||
<li><p>N-step returns</p></li>
|
||||
<li><p>Distributional state-action value learning</p></li>
|
||||
<li><p>Dueling networks</p></li>
|
||||
<li><p>Noisy Networks</p></li>
|
||||
<li><p>Double DQN</p></li>
|
||||
<li><p>Prioritized Experience Replay</p></li>
|
||||
</ul>
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic">
|
||||
<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
|
||||
</li>
|
||||
<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
|
||||
that the <span class="math notranslate nohighlight">\(i\)</span>-th component of the projected update is calculated as follows:</p>
|
||||
<p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
|
||||
<p>where:
|
||||
@@ -246,36 +248,29 @@ that the <span class="math notranslate nohighlight">\(i-th\)</span> component of
|
||||
* <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom
|
||||
<span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r_t+\gamma r_{t+1} + ... + \gamma^{n-1} r_{t+n-1} + \gamma^{n} z_j\)</span></p>
|
||||
</li>
|
||||
<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
|
||||
probability distribution. Only the target of the actions that were actually taken is updated.</p>
|
||||
</li>
|
||||
<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
|
||||
</li>
|
||||
<li><p class="first">After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
|
||||
using the KL divergence loss that is returned from the network.</p>
|
||||
</li>
|
||||
<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
|
||||
probability distribution. Only the target of the actions that were actually taken is updated.</p></li>
|
||||
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
|
||||
<li><p>After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
|
||||
using the KL divergence loss that is returned from the network.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.rainbow_dqn_agent.</code><code class="descname">RainbowDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/rainbow_dqn_agent.html#RainbowDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>n_step</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>n_step</strong> – (int)
|
||||
The number of steps to bootstrap the network over. The first N-1 steps actual rewards will be accumulated
|
||||
using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
|
||||
prediction.</li>
|
||||
<li><strong>store_transitions_only_when_episodes_are_terminated</strong> – (bool)
|
||||
prediction.</p></li>
|
||||
<li><p><strong>store_transitions_only_when_episodes_are_terminated</strong> – (bool)
|
||||
If set to True, the transitions will be stored in an Episode object until the episode ends, and just then
|
||||
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
|
||||
transitions into the memory, and to do so we need the entire episode first.</li>
|
||||
transitions into the memory, and to do so we need the entire episode first.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -293,7 +288,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
|
||||
<a href="qr_dqn.html" class="btn btn-neutral float-right" title="Quantile Regression DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../policy_optimization/ppo.html" class="btn btn-neutral" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../policy_optimization/ppo.html" class="btn btn-neutral float-left" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -302,7 +297,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -319,27 +314,16 @@ transitions into the memory, and to do so we need the entire episode first.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user