Mirror of https://github.com/gryf/coach.git, synced 2025-12-17 11:10:20 +01:00
Enabling Coach Documentation to be run even when environments are not installed (#326)
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Additional Parameters — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Additional Parameters — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../_static/css/custom.css" type="text/css" />
|
||||
@@ -32,21 +40,16 @@
|
||||
<link rel="prev" title="Spaces" href="spaces.html" />
|
||||
<link href="../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -193,51 +196,47 @@
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.base_parameters.VisualizationParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">VisualizationParameters</code><span class="sig-paren">(</span><em>print_networks_summary=False</em>, <em>dump_csv=True</em>, <em>dump_signals_to_csv_every_x_episodes=5</em>, <em>dump_gifs=False</em>, <em>dump_mp4=False</em>, <em>video_dump_methods=None</em>, <em>dump_in_episode_signals=False</em>, <em>dump_parameters_documentation=True</em>, <em>render=False</em>, <em>native_rendering=False</em>, <em>max_fps_for_human_control=10</em>, <em>tensorboard=False</em>, <em>add_rendered_image_to_env_response=False</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/base_parameters.html#VisualizationParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.VisualizationParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>print_networks_summary</strong> – If set to True, a summary of all the networks structure will be printed at the beginning of the experiment</li>
|
||||
<li><strong>dump_csv</strong> – If set to True, the logger will dump logs to a csv file once in every dump_signals_to_csv_every_x_episodes
|
||||
episodes. The logs can be later used to visualize the training process using Coach Dashboard.</li>
|
||||
<li><strong>dump_signals_to_csv_every_x_episodes</strong> – Defines the number of episodes between writing new data to the csv log files. Lower values can affect
|
||||
performance, as writing to disk may take time, and it is done synchronously.</li>
|
||||
<li><strong>dump_gifs</strong> – If set to True, GIF videos of the environment will be stored into the experiment directory according to
|
||||
the filters defined in video_dump_methods.</li>
|
||||
<li><strong>dump_mp4</strong> – If set to True, MP4 videos of the environment will be stored into the experiment directory according to
|
||||
the filters defined in video_dump_methods.</li>
|
||||
<li><strong>dump_in_episode_signals</strong> – If set to True, csv files will be dumped for each episode for inspecting different metrics within the
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>print_networks_summary</strong> – If set to True, a summary of all the networks structure will be printed at the beginning of the experiment</p></li>
|
||||
<li><p><strong>dump_csv</strong> – If set to True, the logger will dump logs to a csv file once in every dump_signals_to_csv_every_x_episodes
|
||||
episodes. The logs can be later used to visualize the training process using Coach Dashboard.</p></li>
|
||||
<li><p><strong>dump_signals_to_csv_every_x_episodes</strong> – Defines the number of episodes between writing new data to the csv log files. Lower values can affect
|
||||
performance, as writing to disk may take time, and it is done synchronously.</p></li>
|
||||
<li><p><strong>dump_gifs</strong> – If set to True, GIF videos of the environment will be stored into the experiment directory according to
|
||||
the filters defined in video_dump_methods.</p></li>
|
||||
<li><p><strong>dump_mp4</strong> – If set to True, MP4 videos of the environment will be stored into the experiment directory according to
|
||||
the filters defined in video_dump_methods.</p></li>
|
||||
<li><p><strong>dump_in_episode_signals</strong> – If set to True, csv files will be dumped for each episode for inspecting different metrics within the
|
||||
episode. This means that for each step in each episode, different metrics such as the reward, the
|
||||
future return, etc. will be saved. Setting this to True may affect performance severely, and therefore
|
||||
this should be used only for debugging purposes.</li>
|
||||
<li><strong>dump_parameters_documentation</strong> – If set to True, a json file containing all the agent parameters will be saved in the experiment directory.
|
||||
this should be used only for debugging purposes.</p></li>
|
||||
<li><p><strong>dump_parameters_documentation</strong> – If set to True, a json file containing all the agent parameters will be saved in the experiment directory.
|
||||
This may be very useful for inspecting the values defined for each parameters and making sure that all
|
||||
the parameters are defined as expected.</li>
|
||||
<li><strong>render</strong> – If set to True, the environment render function will be called for each step, rendering the image of the
|
||||
the parameters are defined as expected.</p></li>
|
||||
<li><p><strong>render</strong> – If set to True, the environment render function will be called for each step, rendering the image of the
|
||||
environment. This may affect the performance of training, and is highly dependent on the environment.
|
||||
By default, Coach uses PyGame to render the environment image instead of the environment specific renderer.
|
||||
To change this, use the native_rendering flag.</li>
|
||||
<li><strong>native_rendering</strong> – If set to True, the environment native renderer will be used for rendering the environment image.
|
||||
To change this, use the native_rendering flag.</p></li>
|
||||
<li><p><strong>native_rendering</strong> – If set to True, the environment native renderer will be used for rendering the environment image.
|
||||
In some cases this can be slower than rendering using PyGame through Coach, but in other cases the
|
||||
environment opens its native renderer by default, so rendering with PyGame is an unnecessary overhead.</li>
|
||||
<li><strong>max_fps_for_human_control</strong> – The maximum number of frames per second used while playing the environment as a human. This only has
|
||||
effect while using the –play flag for Coach.</li>
|
||||
<li><strong>tensorboard</strong> – If set to True, TensorBoard summaries will be stored in the experiment directory. This can later be
|
||||
loaded in TensorBoard in order to visualize the training process.</li>
|
||||
<li><strong>video_dump_methods</strong> – A list of dump methods that will be used as filters for deciding when to save videos.
|
||||
environment opens its native renderer by default, so rendering with PyGame is an unnecessary overhead.</p></li>
|
||||
<li><p><strong>max_fps_for_human_control</strong> – The maximum number of frames per second used while playing the environment as a human. This only has
|
||||
effect while using the –play flag for Coach.</p></li>
|
||||
<li><p><strong>tensorboard</strong> – If set to True, TensorBoard summaries will be stored in the experiment directory. This can later be
|
||||
loaded in TensorBoard in order to visualize the training process.</p></li>
|
||||
<li><p><strong>video_dump_methods</strong> – A list of dump methods that will be used as filters for deciding when to save videos.
|
||||
The filters in the list will be checked one after the other until the first dump method that returns
|
||||
false for should_dump() in the environment class. This list will only be used if dump_mp4 or dump_gif are
|
||||
set to True.</li>
|
||||
<li><strong>add_rendered_image_to_env_response</strong> – Some environments have a different observation compared to the one displayed while rendering.
|
||||
set to True.</p></li>
|
||||
<li><p><strong>add_rendered_image_to_env_response</strong> – Some environments have a different observation compared to the one displayed while rendering.
|
||||
For some cases it can be useful to pass the rendered image to the agent for visualization purposes.
|
||||
If this flag is set to True, the rendered image will be added to the environment EnvResponse object,
|
||||
which will be passed to the agent and allow using those images.</li>
|
||||
which will be passed to the agent and allow using those images.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
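The signature documented above can be exercised directly when building a preset. The following is a minimal illustrative sketch (not part of this commit); the chosen values are assumptions for demonstration only.

    from rl_coach.base_parameters import VisualizationParameters

    # Illustrative values; every keyword below appears in the signature documented above.
    vis_params = VisualizationParameters(
        dump_csv=True,                             # write csv logs usable by Coach Dashboard
        dump_signals_to_csv_every_x_episodes=10,   # flush the csv every 10 episodes
        dump_gifs=True,                            # store GIF videos in the experiment directory
        tensorboard=True,                          # also store TensorBoard summaries
        render=False,                              # do not render the environment on screen
    )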
@@ -246,29 +245,25 @@ which will be passed to the agent and allow using those images.</li>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.base_parameters.PresetValidationParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">PresetValidationParameters</code><span class="sig-paren">(</span><em>test=False</em>, <em>min_reward_threshold=0</em>, <em>max_episodes_to_achieve_reward=1</em>, <em>num_workers=1</em>, <em>reward_test_level=None</em>, <em>test_using_a_trace_test=True</em>, <em>trace_test_levels=None</em>, <em>trace_max_env_steps=5000</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/base_parameters.html#PresetValidationParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.PresetValidationParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>test</strong> – A flag which specifies if the preset should be tested as part of the validation process.</li>
|
||||
<li><strong>min_reward_threshold</strong> – The minimum reward that the agent should pass after max_episodes_to_achieve_reward episodes when the
|
||||
preset is run.</li>
|
||||
<li><strong>max_episodes_to_achieve_reward</strong> – The maximum number of episodes that the agent should train using the preset in order to achieve the
|
||||
reward specified by min_reward_threshold.</li>
|
||||
<li><strong>num_workers</strong> – The number of workers that should be used when running this preset in the test suite for validation.</li>
|
||||
<li><strong>reward_test_level</strong> – The environment level or levels, given by a list of strings, that should be tested as part of the
|
||||
reward tests suite.</li>
|
||||
<li><strong>test_using_a_trace_test</strong> – A flag that specifies if the preset should be run as part of the trace tests suite.</li>
|
||||
<li><strong>trace_test_levels</strong> – The environment level or levels, given by a list of strings, that should be tested as part of the
|
||||
trace tests suite.</li>
|
||||
<li><strong>trace_max_env_steps</strong> – An integer representing the maximum number of environment steps to run when running this preset as part
|
||||
of the trace tests suite.</li>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>test</strong> – A flag which specifies if the preset should be tested as part of the validation process.</p></li>
|
||||
<li><p><strong>min_reward_threshold</strong> – The minimum reward that the agent should pass after max_episodes_to_achieve_reward episodes when the
|
||||
preset is run.</p></li>
|
||||
<li><p><strong>max_episodes_to_achieve_reward</strong> – The maximum number of episodes that the agent should train using the preset in order to achieve the
|
||||
reward specified by min_reward_threshold.</p></li>
|
||||
<li><p><strong>num_workers</strong> – The number of workers that should be used when running this preset in the test suite for validation.</p></li>
|
||||
<li><p><strong>reward_test_level</strong> – The environment level or levels, given by a list of strings, that should be tested as part of the
|
||||
reward tests suite.</p></li>
|
||||
<li><p><strong>test_using_a_trace_test</strong> – A flag that specifies if the preset should be run as part of the trace tests suite.</p></li>
|
||||
<li><p><strong>trace_test_levels</strong> – The environment level or levels, given by a list of strings, that should be tested as part of the
|
||||
trace tests suite.</p></li>
|
||||
<li><p><strong>trace_max_env_steps</strong> – An integer representing the maximum number of environment steps to run when running this preset as part
|
||||
of the trace tests suite.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
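As a hedged usage sketch (the numeric values are illustrative and 'CartPole-v0' is a hypothetical level name, neither taken from this commit), a preset could declare its validation settings as follows:

    from rl_coach.base_parameters import PresetValidationParameters

    preset_validation = PresetValidationParameters(
        test=True,                           # include this preset in the validation process
        min_reward_threshold=150,            # hypothetical reward target
        max_episodes_to_achieve_reward=300,  # reach the target within 300 training episodes
        reward_test_level='CartPole-v0',     # hypothetical environment level for the reward test
        trace_max_env_steps=2000,            # cap each trace test at 2000 environment steps
    )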
@@ -277,30 +272,26 @@ of the trace tests suite.</li>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.base_parameters.TaskParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">TaskParameters</code><span class="sig-paren">(</span><em>framework_type: rl_coach.base_parameters.Frameworks = <Frameworks.tensorflow: 'TensorFlow'></em>, <em>evaluate_only: int = None</em>, <em>use_cpu: bool = False</em>, <em>experiment_path='/tmp'</em>, <em>seed=None</em>, <em>checkpoint_save_secs=None</em>, <em>checkpoint_restore_dir=None</em>, <em>checkpoint_restore_path=None</em>, <em>checkpoint_save_dir=None</em>, <em>export_onnx_graph: bool = False</em>, <em>apply_stop_condition: bool = False</em>, <em>num_gpu: int = 1</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/base_parameters.html#TaskParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.TaskParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>framework_type</strong> – deep learning framework type. currently only tensorflow is supported</li>
|
||||
<li><strong>evaluate_only</strong> – if not None, the task will be used only for evaluating the model for the given number of steps.
|
||||
A value of 0 means that task will be evaluated for an infinite number of steps.</li>
|
||||
<li><strong>use_cpu</strong> – use the cpu for this task</li>
|
||||
<li><strong>experiment_path</strong> – the path to the directory which will store all the experiment outputs</li>
|
||||
<li><strong>seed</strong> – a seed to use for the random numbers generator</li>
|
||||
<li><strong>checkpoint_save_secs</strong> – the number of seconds between each checkpoint saving</li>
|
||||
<li><strong>checkpoint_restore_dir</strong> – [DEPRECATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
|
||||
the dir to restore the checkpoints from</li>
|
||||
<li><strong>checkpoint_restore_path</strong> – the path to restore the checkpoints from</li>
|
||||
<li><strong>checkpoint_save_dir</strong> – the directory to store the checkpoints in</li>
|
||||
<li><strong>export_onnx_graph</strong> – If set to True, this will export an onnx graph each time a checkpoint is saved</li>
|
||||
<li><strong>apply_stop_condition</strong> – If set to True, this will apply the stop condition defined by reaching a target success rate</li>
|
||||
<li><strong>num_gpu</strong> – number of GPUs to use</li>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>framework_type</strong> – deep learning framework type. currently only tensorflow is supported</p></li>
|
||||
<li><p><strong>evaluate_only</strong> – if not None, the task will be used only for evaluating the model for the given number of steps.
|
||||
A value of 0 means that task will be evaluated for an infinite number of steps.</p></li>
|
||||
<li><p><strong>use_cpu</strong> – use the cpu for this task</p></li>
|
||||
<li><p><strong>experiment_path</strong> – the path to the directory which will store all the experiment outputs</p></li>
|
||||
<li><p><strong>seed</strong> – a seed to use for the random numbers generator</p></li>
|
||||
<li><p><strong>checkpoint_save_secs</strong> – the number of seconds between each checkpoint saving</p></li>
|
||||
<li><p><strong>checkpoint_restore_dir</strong> – [DEPRECATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
|
||||
the dir to restore the checkpoints from</p></li>
|
||||
<li><p><strong>checkpoint_restore_path</strong> – the path to restore the checkpoints from</p></li>
|
||||
<li><p><strong>checkpoint_save_dir</strong> – the directory to store the checkpoints in</p></li>
|
||||
<li><p><strong>export_onnx_graph</strong> – If set to True, this will export an onnx graph each time a checkpoint is saved</p></li>
|
||||
<li><p><strong>apply_stop_condition</strong> – If set to True, this will apply the stop condition defined by reaching a target success rate</p></li>
|
||||
<li><p><strong>num_gpu</strong> – number of GPUs to use</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
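A minimal sketch of constructing TaskParameters with a few of the documented arguments; the experiment path and interval values below are assumptions, not defaults taken from this commit:

    from rl_coach.base_parameters import TaskParameters

    task_params = TaskParameters(
        evaluate_only=None,                       # None trains; 0 would evaluate for an infinite number of steps
        use_cpu=True,                             # run this task on the CPU
        experiment_path='/tmp/coach_experiment',  # hypothetical directory for all experiment outputs
        checkpoint_save_secs=600,                 # save a checkpoint every 10 minutes
        export_onnx_graph=False,                  # do not export an onnx graph on checkpoint saves
    )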
@@ -309,35 +300,31 @@ the dir to restore the checkpoints from</li>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.base_parameters.DistributedTaskParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">DistributedTaskParameters</code><span class="sig-paren">(</span><em>framework_type: rl_coach.base_parameters.Frameworks</em>, <em>parameters_server_hosts: str</em>, <em>worker_hosts: str</em>, <em>job_type: str</em>, <em>task_index: int</em>, <em>evaluate_only: int = None</em>, <em>num_tasks: int = None</em>, <em>num_training_tasks: int = None</em>, <em>use_cpu: bool = False</em>, <em>experiment_path=None</em>, <em>dnd=None</em>, <em>shared_memory_scratchpad=None</em>, <em>seed=None</em>, <em>checkpoint_save_secs=None</em>, <em>checkpoint_restore_path=None</em>, <em>checkpoint_save_dir=None</em>, <em>export_onnx_graph: bool = False</em>, <em>apply_stop_condition: bool = False</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/base_parameters.html#DistributedTaskParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.DistributedTaskParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>framework_type</strong> – deep learning framework type. currently only tensorflow is supported</li>
|
||||
<li><strong>evaluate_only</strong> – if not None, the task will be used only for evaluating the model for the given number of steps.
|
||||
A value of 0 means that task will be evaluated for an infinite number of steps.</li>
|
||||
<li><strong>parameters_server_hosts</strong> – comma-separated list of hostname:port pairs to which the parameter servers are
|
||||
assigned</li>
|
||||
<li><strong>worker_hosts</strong> – comma-separated list of hostname:port pairs to which the workers are assigned</li>
|
||||
<li><strong>job_type</strong> – the job type - either ps (short for parameters server) or worker</li>
|
||||
<li><strong>task_index</strong> – the index of the process</li>
|
||||
<li><strong>num_tasks</strong> – the number of total tasks that are running (not including the parameters server)</li>
|
||||
<li><strong>num_training_tasks</strong> – the number of tasks that are training (not including the parameters server)</li>
|
||||
<li><strong>use_cpu</strong> – use the cpu for this task</li>
|
||||
<li><strong>experiment_path</strong> – the path to the directory which will store all the experiment outputs</li>
|
||||
<li><strong>dnd</strong> – an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad.</li>
|
||||
<li><strong>seed</strong> – a seed to use for the random numbers generator</li>
|
||||
<li><strong>checkpoint_save_secs</strong> – the number of seconds between each checkpoint saving</li>
|
||||
<li><strong>checkpoint_restore_path</strong> – the path to restore the checkpoints from</li>
|
||||
<li><strong>checkpoint_save_dir</strong> – the directory to store the checkpoints in</li>
|
||||
<li><strong>export_onnx_graph</strong> – If set to True, this will export an onnx graph each time a checkpoint is saved</li>
|
||||
<li><strong>apply_stop_condition</strong> – If set to True, this will apply the stop condition defined by reaching a target success rate</li>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>framework_type</strong> – deep learning framework type. currently only tensorflow is supported</p></li>
|
||||
<li><p><strong>evaluate_only</strong> – if not None, the task will be used only for evaluating the model for the given number of steps.
|
||||
A value of 0 means that task will be evaluated for an infinite number of steps.</p></li>
|
||||
<li><p><strong>parameters_server_hosts</strong> – comma-separated list of hostname:port pairs to which the parameter servers are
|
||||
assigned</p></li>
|
||||
<li><p><strong>worker_hosts</strong> – comma-separated list of hostname:port pairs to which the workers are assigned</p></li>
|
||||
<li><p><strong>job_type</strong> – the job type - either ps (short for parameters server) or worker</p></li>
|
||||
<li><p><strong>task_index</strong> – the index of the process</p></li>
|
||||
<li><p><strong>num_tasks</strong> – the number of total tasks that are running (not including the parameters server)</p></li>
|
||||
<li><p><strong>num_training_tasks</strong> – the number of tasks that are training (not including the parameters server)</p></li>
|
||||
<li><p><strong>use_cpu</strong> – use the cpu for this task</p></li>
|
||||
<li><p><strong>experiment_path</strong> – the path to the directory which will store all the experiment outputs</p></li>
|
||||
<li><p><strong>dnd</strong> – an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad.</p></li>
|
||||
<li><p><strong>seed</strong> – a seed to use for the random numbers generator</p></li>
|
||||
<li><p><strong>checkpoint_save_secs</strong> – the number of seconds between each checkpoint saving</p></li>
|
||||
<li><p><strong>checkpoint_restore_path</strong> – the path to restore the checkpoints from</p></li>
|
||||
<li><p><strong>checkpoint_save_dir</strong> – the directory to store the checkpoints in</p></li>
|
||||
<li><p><strong>export_onnx_graph</strong> – If set to True, this will export an onnx graph each time a checkpoint is saved</p></li>
|
||||
<li><p><strong>apply_stop_condition</strong> – If set to True, this will apply the stop condition defined by reaching a target success rate</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
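The host arguments are the comma-separated hostname:port lists described above. A sketch for one worker process, with made-up addresses and counts, might look like this:

    from rl_coach.base_parameters import DistributedTaskParameters, Frameworks

    worker_params = DistributedTaskParameters(
        framework_type=Frameworks.tensorflow,
        parameters_server_hosts='10.0.0.1:2222',      # comma-separated hostname:port pairs
        worker_hosts='10.0.0.2:2222,10.0.0.3:2222',   # one entry per worker
        job_type='worker',                            # either 'ps' (parameters server) or 'worker'
        task_index=0,                                 # the index of this process
        num_tasks=2,                                  # running tasks, excluding the parameters server
        num_training_tasks=2,                         # tasks that are training
    )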
@@ -352,7 +339,7 @@ assigned</li>
|
||||
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
|
||||
|
||||
|
||||
<a href="spaces.html" class="btn btn-neutral" title="Spaces" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="spaces.html" class="btn btn-neutral float-left" title="Spaces" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -361,7 +348,7 @@ assigned</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -378,27 +365,16 @@ assigned</li>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Behavioral Cloning — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Behavioral Cloning — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="ACER" href="../policy_optimization/acer.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -230,9 +233,9 @@ These demonstrations are given as state, action tuples, and with no reward.
|
||||
The training goal is to reduce the difference between the actions predicted by the network and the actions taken by
|
||||
the expert for each state.</p>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>Use the current states as input to the network, and the expert actions as the targets of the network.</li>
|
||||
<li>For the network head, we use the policy head, which uses the cross entropy loss function.</li>
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>Use the current states as input to the network, and the expert actions as the targets of the network.</p></li>
|
||||
<li><p>For the network head, we use the policy head, which uses the cross entropy loss function.</p></li>
|
||||
</ol>
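The cross entropy objective in step 3 can be written out explicitly. The snippet below is a framework-agnostic NumPy sketch with made-up numbers, not Coach's actual implementation:

    import numpy as np

    # Hypothetical batch of 3 transitions: expert action indices and the policy
    # head's predicted action probabilities for the corresponding states.
    expert_actions = np.array([2, 0, 1])
    predicted_probs = np.array([[0.1, 0.2, 0.7],
                                [0.8, 0.1, 0.1],
                                [0.3, 0.5, 0.2]])

    # Cross entropy between the predicted action distribution and the expert actions.
    loss = -np.mean(np.log(predicted_probs[np.arange(len(expert_actions)), expert_actions]))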
<dl class="class">
|
||||
<dt id="rl_coach.agents.bc_agent.BCAlgorithmParameters">
|
||||
@@ -254,7 +257,7 @@ the expert for each state.</p>
|
||||
<a href="../value_optimization/bs_dqn.html" class="btn btn-neutral float-right" title="Bootstrapped DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../policy_optimization/acer.html" class="btn btn-neutral" title="ACER" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../policy_optimization/acer.html" class="btn btn-neutral float-left" title="ACER" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -263,7 +266,7 @@ the expert for each state.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -280,27 +283,16 @@ the expert for each state.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Conditional Imitation Learning — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Conditional Imitation Learning — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Categorical DQN" href="../value_optimization/categorical_dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -233,25 +236,22 @@ the expert for each state.
|
||||
In conditional imitation learning, each transition is assigned a class, which determines the goal that was pursued
in that transition. For example, 3 possible classes can be: turn right, turn left and follow lane.</p>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer, where the batch is balanced, meaning that an equal number
|
||||
of transitions will be sampled from each class index.</li>
|
||||
<li>Use the current states as input to the network, and assign the expert actions as the targets of the network heads
|
||||
<li><p>Sample a batch of transitions from the replay buffer, where the batch is balanced, meaning that an equal number
|
||||
of transitions will be sampled from each class index.</p></li>
|
||||
<li><p>Use the current states as input to the network, and assign the expert actions as the targets of the network heads
|
||||
corresponding to the state classes. For the other heads, set the targets to match the currently predicted values,
|
||||
so that the loss for the other heads will be zeroed out.</li>
|
||||
<li>We use a regression head, that minimizes the MSE loss between the network predicted values and the target values.</li>
|
||||
so that the loss for the other heads will be zeroed out.</p></li>
|
||||
<li><p>We use a regression head, that minimizes the MSE loss between the network predicted values and the target values.</p></li>
|
||||
</ol>
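The masking described in step 2 (the heads that do not match the transition's class get targets equal to their current predictions, so their loss is zeroed out) can be sketched as follows. This is an illustrative NumPy example with made-up shapes, not Coach's implementation:

    import numpy as np

    batch_size, num_classes, action_dim = 3, 3, 2
    class_indices = np.array([0, 2, 1])                   # e.g. 0=turn left, 1=turn right, 2=follow lane
    predictions = np.random.rand(batch_size, num_classes, action_dim)  # current outputs of all heads
    expert_actions = np.random.rand(batch_size, action_dim)

    targets = predictions.copy()                          # other heads: target == prediction => zero loss
    targets[np.arange(batch_size), class_indices] = expert_actions    # only the matching head learns

    mse_loss = np.mean((predictions - targets) ** 2)      # step 3: regression head with MSE loss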
<dl class="class">
|
||||
<dt id="rl_coach.agents.cil_agent.CILAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.cil_agent.</code><code class="descname">CILAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/cil_agent.html#CILAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.cil_agent.CILAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state_key_with_the_class_index</strong> – (str)
|
||||
The key of the state dictionary which corresponds to the value that will be used to control the class index.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>state_key_with_the_class_index</strong> – (str)
|
||||
The key of the state dictionary which corresponds to the value that will be used to control the class index.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -269,7 +269,7 @@ The key of the state dictionary which corresponds to the value that will be used
|
||||
<a href="../policy_optimization/cppo.html" class="btn btn-neutral float-right" title="Clipped Proximal Policy Optimization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../value_optimization/categorical_dqn.html" class="btn btn-neutral" title="Categorical DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../value_optimization/categorical_dqn.html" class="btn btn-neutral float-left" title="Categorical DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -278,7 +278,7 @@ The key of the state dictionary which corresponds to the value that will be used
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -295,27 +295,16 @@ The key of the state dictionary which corresponds to the value that will be used
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Agents — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Agents — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Adding a New Environment" href="../../contributing/add_env.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -241,59 +244,50 @@ A detailed description of those algorithms can be found by navigating to each of
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.base_parameters.AgentParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">AgentParameters</code><span class="sig-paren">(</span><em>algorithm: rl_coach.base_parameters.AlgorithmParameters, exploration: ExplorationParameters, memory: MemoryParameters, networks: Dict[str, rl_coach.base_parameters.NetworkParameters], visualization: rl_coach.base_parameters.VisualizationParameters = <rl_coach.base_parameters.VisualizationParameters object></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/base_parameters.html#AgentParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.AgentParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>algorithm</strong> – A class inheriting AlgorithmParameters.
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>algorithm</strong> – A class inheriting AlgorithmParameters.
|
||||
The parameters used for the specific algorithm used by the agent.
|
||||
These parameters can be later referenced in the agent implementation through self.ap.algorithm.</li>
|
||||
<li><strong>exploration</strong> – Either a class inheriting ExplorationParameters or a dictionary mapping between action
|
||||
These parameters can be later referenced in the agent implementation through self.ap.algorithm.</p></li>
|
||||
<li><p><strong>exploration</strong> – Either a class inheriting ExplorationParameters or a dictionary mapping between action
|
||||
space types and their corresponding ExplorationParameters. If a dictionary was used,
|
||||
when the agent will be instantiated, the correct exploration policy parameters will be used
|
||||
according to the real type of the environment action space.
|
||||
These parameters will be used to instantiate the exploration policy.</li>
|
||||
<li><strong>memory</strong> – A class inheriting MemoryParameters. It defines all the parameters used by the memory module.</li>
|
||||
<li><strong>networks</strong> – A dictionary mapping between network names and their corresponding network parameters, defined
|
||||
These parameters will be used to instantiate the exploration policy.</p></li>
|
||||
<li><p><strong>memory</strong> – A class inheriting MemoryParameters. It defines all the parameters used by the memory module.</p></li>
|
||||
<li><p><strong>networks</strong> – A dictionary mapping between network names and their corresponding network parameters, defined
|
||||
as a class inheriting NetworkParameters. Each element will be used in order to instantiate
|
||||
a NetworkWrapper class, and all the network wrappers will be stored in the agent under
|
||||
self.network_wrappers. self.network_wrappers is a dict mapping between the network name that
|
||||
was given in the networks dict, and the instantiated network wrapper.</li>
|
||||
<li><strong>visualization</strong> – A class inheriting VisualizationParameters and defining various parameters that can be
|
||||
used for visualization purposes, such as printing to the screen, rendering, and saving videos.</li>
|
||||
was given in the networks dict, and the instantiated network wrapper.</p></li>
|
||||
<li><p><strong>visualization</strong> – A class inheriting VisualizationParameters and defining various parameters that can be
|
||||
used for visualization purposes, such as printing to the screen, rendering, and saving videos.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.agent.Agent">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.agent.</code><code class="descname">Agent</code><span class="sig-paren">(</span><em>agent_parameters: rl_coach.base_parameters.AgentParameters</em>, <em>parent: Union[LevelManager</em>, <em>CompositeAgent] = None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>agent_parameters</strong> – A AgentParameters class instance with all the agent parameters</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>agent_parameters</strong> – A AgentParameters class instance with all the agent parameters</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.act">
|
||||
<code class="descname">act</code><span class="sig-paren">(</span><em>action: Union[None</em>, <em>int</em>, <em>float</em>, <em>numpy.ndarray</em>, <em>List] = None</em><span class="sig-paren">)</span> → rl_coach.core_types.ActionInfo<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.act"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.act" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given the agent's current knowledge, decide on the next action to apply to the environment</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> – An action to take, overriding whatever the current policy is</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">An ActionInfo object, which contains the action and any additional info from the action decision process</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action</strong> – An action to take, overriding whatever the current policy is</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>An ActionInfo object, which contains the action and any additional info from the action decision process</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -302,21 +296,17 @@ used for visualization purposes, such as printing to the screen, rendering, and
|
||||
<dd><p>This function is a wrapper to allow having the same calls for shared or unshared memories.
|
||||
It should be used instead of calling the memory directly in order to allow different algorithms to work
|
||||
both with a shared and a local memory.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>func</strong> – the name of the memory function to call</li>
|
||||
<li><strong>args</strong> – the arguments to supply to the function</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>func</strong> – the name of the memory function to call</p></li>
|
||||
<li><p><strong>args</strong> – the arguments to supply to the function</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the return value of the function</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the return value of the function</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -324,16 +314,14 @@ both with a shared and a local memory.</p>
|
||||
<code class="descname">choose_action</code><span class="sig-paren">(</span><em>curr_state</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.choose_action"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.choose_action" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>choose an action to act with in the current episode being played. Different behavior might be exhibited when
|
||||
training or testing.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>curr_state</strong> – the current state to act upon.</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">chosen action, some action value describing the action (q-value, probability, etc)</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>curr_state</strong> – the current state to act upon.</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>chosen action, some action value describing the action (q-value, probability, etc)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -351,14 +339,11 @@ training or testing.</p>
|
||||
<dd><p>Create all the networks of the agent.
|
||||
The network creation will be done after setting the environment parameters for the agent, since they are needed
|
||||
for creating the network.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A list containing all the networks</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>A list containing all the networks</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -367,37 +352,31 @@ for creating the network.</p>
|
||||
<dd><p>Get a prediction from the agent with regard to the requested prediction_type.
|
||||
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
|
||||
raise a ValueException.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>states</strong> – The states to get a prediction for</li>
|
||||
<li><strong>prediction_type</strong> – The type of prediction to get for the states. For example, the state-value prediction.</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>states</strong> – The states to get a prediction for</p></li>
|
||||
<li><p><strong>prediction_type</strong> – The type of prediction to get for the states. For example, the state-value prediction.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the predicted values</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the predicted values</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.get_state_embedding">
|
||||
<code class="descname">get_state_embedding</code><span class="sig-paren">(</span><em>state: dict</em><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.get_state_embedding"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.get_state_embedding" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given a state, get the corresponding state embedding from the main network</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state</strong> – a state dict</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy embedding vector</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>state</strong> – a state dict</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a numpy embedding vector</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -406,14 +385,11 @@ raise a ValueException.</p>
|
||||
<dd><p>Make any changes needed when each episode is ended.
|
||||
This includes incrementing counters, updating full episode dependent values, updating logs, etc.
|
||||
This function is called right after each episode is ended.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -421,44 +397,36 @@ This function is called right after each episode is ended.</p>
|
||||
<code class="descname">init_environment_dependent_modules</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.init_environment_dependent_modules"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.init_environment_dependent_modules" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Initialize any modules that depend on knowing information about the environment such as the action space or
|
||||
the observation space</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.learn_from_batch">
|
||||
<code class="descname">learn_from_batch</code><span class="sig-paren">(</span><em>batch</em><span class="sig-paren">)</span> → Tuple[float, List, List]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.learn_from_batch"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.learn_from_batch" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given a batch of transitions, calculates their target values and updates the network.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>batch</strong> – A list of transitions</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">The total loss of the training, the loss per head and the unclipped gradients</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>batch</strong> – A list of transitions</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>The total loss of the training, the loss per head and the unclipped gradients</p>
|
||||
</dd>
|
||||
</dl>
</dd></dl>
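<p>A sketch of the return signature only; the agent instance and the batch of transitions (sampled from the agent's memory) are assumed to already exist.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
# `agent` and `batch` are assumptions; the unpacking mirrors the Tuple[float, List, List] above
total_loss, losses_per_head, unclipped_grads = agent.learn_from_batch(batch)
print('total loss: {}, per-head losses: {}'.format(total_loss, losses_per_head))
</pre></div></div>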
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.log_to_screen">
|
||||
<code class="descname">log_to_screen</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.log_to_screen"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.log_to_screen" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Write an episode summary line to the terminal</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -467,59 +435,48 @@ the observation space</p>
|
||||
<dd><p>Given a response from the environment, distill the observation from it and store it for later use.
|
||||
The response should be a dictionary containing the performed action, the new observation and measurements,
|
||||
the reward, a game over flag and any additional information necessary.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>env_response</strong> – result of call from environment.step(action)</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a boolean value which determines if the agent has decided to terminate the episode after seeing the
|
||||
given observation</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>env_response</strong> – result of call from environment.step(action)</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a boolean value which determines if the agent has decided to terminate the episode after seeing the
|
||||
given observation</p>
|
||||
</dd>
|
||||
</dl>
</dd></dl>
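<p>A minimal interaction-loop sketch, assuming an already-constructed environment and agent; the exact act()/step() call shapes are assumptions based on the description above.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
# `env` and `agent` are assumed to exist; env.step(action) is assumed to return the
# environment response described above
action_info = agent.act()
env_response = env.step(action_info.action)
episode_ended = agent.observe(env_response)   # True if the agent decided to end the episode
if episode_ended:
    agent.handle_episode_ended()
</pre></div></div>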
<dl class="attribute">
|
||||
<dt id="rl_coach.agents.agent.Agent.parent">
|
||||
<code class="descname">parent</code><a class="headerlink" href="#rl_coach.agents.agent.Agent.parent" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the parent class of the agent</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the parent of the agent</td>
</tr>
</tbody>
</table>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>the parent of the agent</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="attribute">
|
||||
<dt id="rl_coach.agents.agent.Agent.phase">
|
||||
<code class="descname">phase</code><a class="headerlink" href="#rl_coach.agents.agent.Agent.phase" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>The current running phase of the agent</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">RunPhase</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>RunPhase</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.post_training_commands">
|
||||
<code class="descname">post_training_commands</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.post_training_commands"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.post_training_commands" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>A function which allows adding any functionality that is required to run right after the training phase ends.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -527,45 +484,37 @@ given observation</td>
|
||||
<code class="descname">prepare_batch_for_inference</code><span class="sig-paren">(</span><em>states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str</em><span class="sig-paren">)</span> → Dict[str, numpy.array]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.prepare_batch_for_inference"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.prepare_batch_for_inference" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Convert curr_state into the input tensors that TensorFlow expects, i.e. if we have several input states, stack all
observations together, all measurements together, etc.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>states</strong> – A list of environment states, where each one is a dict mapping from an observation name to its
|
||||
corresponding observation</li>
|
||||
<li><strong>network_name</strong> – The agent network name to prepare the batch for. this is needed in order to extract only
|
||||
the observation relevant for the network from the states.</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>states</strong> – A list of environment states, where each one is a dict mapping from an observation name to its
|
||||
corresponding observation</p></li>
|
||||
<li><p><strong>network_name</strong> – The agent network name to prepare the batch for. this is needed in order to extract only
|
||||
the observation relevant for the network from the states.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A dictionary containing a list of values from all the given states for each of the observations</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>A dictionary containing a list of values from all the given states for each of the observations</p>
|
||||
</dd>
|
||||
</dl>
</dd></dl>
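<p>The stacking behaviour itself can be illustrated with plain numpy (this is an illustration of the idea, not the actual implementation):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

# several per-state dicts become one dict of batched arrays
states = [{'observation': np.ones(4), 'measurements': np.zeros(2)} for _ in range(3)]
batch = {key: np.stack([s[key] for s in states]) for key in states[0]}
print(batch['observation'].shape)   # (3, 4)
print(batch['measurements'].shape)  # (3, 2)
</pre></div></div>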
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.register_signal">
|
||||
<code class="descname">register_signal</code><span class="sig-paren">(</span><em>signal_name: str</em>, <em>dump_one_value_per_episode: bool = True</em>, <em>dump_one_value_per_step: bool = False</em><span class="sig-paren">)</span> → rl_coach.utils.Signal<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.register_signal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.register_signal" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Register a signal such that its statistics will be dumped and will be viewable through the dashboard</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>signal_name</strong> – the name of the signal as it will appear in dashboard</li>
|
||||
<li><strong>dump_one_value_per_episode</strong> – should the signal value be written for each episode?</li>
|
||||
<li><strong>dump_one_value_per_step</strong> – should the signal value be written for each step?</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>signal_name</strong> – the name of the signal as it will appear in dashboard</p></li>
|
||||
<li><p><strong>dump_one_value_per_episode</strong> – should the signal value be written for each episode?</p></li>
|
||||
<li><p><strong>dump_one_value_per_step</strong> – should the signal value be written for each step?</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the created signal</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the created signal</p>
|
||||
</dd>
|
||||
</dl>
</dd></dl>
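<p>A usage sketch from inside an Agent subclass (so <code>self</code> is the agent); the signal name is arbitrary and <code>add_sample</code> is assumed to be the Signal update method.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
# sketch; `self` is an Agent subclass instance, e.g. inside setup_logger()
my_signal = self.register_signal('custom_value',
                                 dump_one_value_per_episode=True,
                                 dump_one_value_per_step=False)
my_signal.add_sample(0.73)   # assumed Signal API; the value is dumped to the CSV/dashboard
</pre></div></div>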
<dl class="method">
|
||||
@@ -574,46 +523,39 @@ the observation relevant for the network from the states.</li>
|
||||
<dd><p>Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
|
||||
evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
|
||||
by val, and by the current phase set in self.phase.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> – The new phase to change to</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>val</strong> – The new phase to change to</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.reset_internal_state">
|
||||
<code class="descname">reset_internal_state</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.reset_internal_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.reset_internal_state" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Reset all the episodic parameters. This function is called right before each episode starts.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.restore_checkpoint">
|
||||
<code class="descname">restore_checkpoint</code><span class="sig-paren">(</span><em>checkpoint_dir: str</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.restore_checkpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.restore_checkpoint" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Allows agents to restore additional information when restoring from a checkpoint.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>checkpoint_dir</strong> – The checkpoint dir to restore from</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>checkpoint_dir</strong> – The checkpoint dir to restore from</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -621,51 +563,42 @@ by val, and by the current phase set in self.phase.</p>
|
||||
<code class="descname">run_off_policy_evaluation</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="headerlink" href="#rl_coach.agents.agent.Agent.run_off_policy_evaluation" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Run off-policy evaluation estimators to evaluate the trained policy performance against a dataset.
|
||||
Should only be implemented for off-policy RL algorithms.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.run_pre_network_filter_for_inference">
|
||||
<code class="descname">run_pre_network_filter_for_inference</code><span class="sig-paren">(</span><em>state: Dict[str, numpy.ndarray], update_filter_internal_state: bool = True</em><span class="sig-paren">)</span> → Dict[str, numpy.ndarray]<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.run_pre_network_filter_for_inference"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.run_pre_network_filter_for_inference" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Run the filters that were defined to be applied right before using the state for inference.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>state</strong> – The state to run the filters on</li>
|
||||
<li><strong>update_filter_internal_state</strong> – Should update the filter’s internal state - should not update when evaluating</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>state</strong> – The state to run the filters on</p></li>
|
||||
<li><p><strong>update_filter_internal_state</strong> – Should update the filter’s internal state - should not update when evaluating</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">The filtered state</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>The filtered state</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.save_checkpoint">
|
||||
<code class="descname">save_checkpoint</code><span class="sig-paren">(</span><em>checkpoint_prefix: str</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.save_checkpoint"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.save_checkpoint" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Allows agents to store additional information when saving checkpoints.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>checkpoint_prefix</strong> – The prefix of the checkpoint file to save</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>checkpoint_prefix</strong> – The prefix of the checkpoint file to save</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -673,16 +606,14 @@ Should only be implemented for off-policy RL algorithms.</p>
|
||||
<code class="descname">set_environment_parameters</code><span class="sig-paren">(</span><em>spaces: rl_coach.spaces.SpacesDefinition</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.set_environment_parameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.set_environment_parameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
|
||||
dependent on those values, by calling init_environment_dependent_modules</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>spaces</strong> – the environment spaces definition</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>spaces</strong> – the environment spaces definition</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -692,58 +623,47 @@ dependent on those values, by calling init_environment_dependent_modules</p>
|
||||
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
|
||||
slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
|
||||
in-action-space.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> – The action that should be set as the directive</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action</strong> – The action that should be set as the directive</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.set_session">
|
||||
<code class="descname">set_session</code><span class="sig-paren">(</span><em>sess</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.set_session"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.set_session" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Set the deep learning framework session for all the agents in the composite agent</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.setup_logger">
|
||||
<code class="descname">setup_logger</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.setup_logger"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.setup_logger" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Setup the logger for the agent</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.sync">
|
||||
<code class="descname">sync</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.sync"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.sync" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sync the global network parameters to local networks</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -752,14 +672,11 @@ in-action-space.</p>
|
||||
<dd><p>Check if a training phase should be done as configured by num_consecutive_playing_steps.
|
||||
If it should, then do several training steps as configured by num_consecutive_training_steps.
|
||||
A single training iteration: Sample a batch, train on it and update target networks.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The total training loss during the training iterations.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>The total training loss during the training iterations.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -768,28 +685,22 @@ A single training iteration: Sample a batch, train on it and update target netwo
|
||||
<dd><p>Updates the episodic log file with all the signal values from the most recent episode.
|
||||
Additional signals for logging can be set by creating a new signal using self.register_signal,
|
||||
and then updating it with some internal agent values.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.agents.agent.Agent.update_step_in_episode_log">
|
||||
<code class="descname">update_step_in_episode_log</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.update_step_in_episode_log"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.update_step_in_episode_log" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Updates the in-episode log file with all the signal values from the most recent step.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -797,16 +708,14 @@ and then updating it with some internal agent values.</p>
|
||||
<code class="descname">update_transition_before_adding_to_replay_buffer</code><span class="sig-paren">(</span><em>transition: rl_coach.core_types.Transition</em><span class="sig-paren">)</span> → rl_coach.core_types.Transition<a class="reference internal" href="../../_modules/rl_coach/agents/agent.html#Agent.update_transition_before_adding_to_replay_buffer"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.agent.Agent.update_transition_before_adding_to_replay_buffer" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Allows agents to update the transition just before adding it to the replay buffer.
|
||||
Can be useful for agents that want to tweak the reward, termination signal, etc.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transition</strong> – the transition to update</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the updated transition</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>transition</strong> – the transition to update</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the updated transition</p>
|
||||
</dd>
|
||||
</dl>
</dd></dl>
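<p>A sketch of overriding this hook in an Agent subclass, clipping rewards before they reach the replay buffer (the clipping range is an arbitrary example):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

# inside an Agent subclass; reward clipping is just an example tweak
def update_transition_before_adding_to_replay_buffer(self, transition):
    transition.reward = float(np.clip(transition.reward, -1.0, 1.0))
    return transition
</pre></div></div>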
</dd></dl>
|
||||
@@ -824,7 +733,7 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
|
||||
<a href="policy_optimization/ac.html" class="btn btn-neutral float-right" title="Actor-Critic" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../../contributing/add_env.html" class="btn btn-neutral" title="Adding a New Environment" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../../contributing/add_env.html" class="btn btn-neutral float-left" title="Adding a New Environment" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -833,7 +742,7 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -850,27 +759,16 @@ Can be useful for agents that want to tweak the reward, termination signal, etc.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Direct Future Prediction — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Direct Future Prediction — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Soft Actor-Critic" href="../policy_optimization/sac.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -228,13 +231,13 @@
|
||||
<div class="section" id="choosing-an-action">
|
||||
<h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>The current states (observations and measurements) and the corresponding goal vector are passed as an input to the network.
|
||||
<li><p>The current states (observations and measurements) and the corresponding goal vector are passed as an input to the network.
|
||||
The output of the network is the predicted future measurements for time-steps <span class="math notranslate nohighlight">\(t+1,t+2,t+4,t+8,t+16\)</span> and
|
||||
<span class="math notranslate nohighlight">\(t+32\)</span> for each possible action.</li>
|
||||
<li>For each action, the measurements of each predicted time-step are multiplied by the goal vector,
|
||||
and the result is a single vector of future values for each action.</li>
|
||||
<li>Then, a weighted sum of the future values of each action is calculated, and the result is a single value for each action.</li>
|
||||
<li>The action values are passed to the exploration policy to decide on the action to use.</li>
|
||||
<span class="math notranslate nohighlight">\(t+32\)</span> for each possible action.</p></li>
|
||||
<li><p>For each action, the measurements of each predicted time-step are multiplied by the goal vector,
|
||||
and the result is a single vector of future values for each action.</p></li>
|
||||
<li><p>Then, a weighted sum of the future values of each action is calculated, and the result is a single value for each action.</p></li>
|
||||
<li><p>The action values are passed to the exploration policy to decide on the action to use.</p></li>
|
||||
</ol>
|
||||
</div>
|
||||
<div class="section" id="training-the-network">
|
||||
@@ -247,39 +250,35 @@ For the actions that were not taken, the targets are the current values.</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.dfp_agent.DFPAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.dfp_agent.</code><code class="descname">DFPAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/dfp_agent.html#DFPAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.dfp_agent.DFPAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_predicted_steps_ahead</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_predicted_steps_ahead</strong> – (int)
|
||||
Number of future steps to predict measurements for. The future steps won’t be sequential, but rather jump
|
||||
in multiples of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4.
|
||||
The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]</li>
|
||||
<li><strong>goal_vector</strong> – (List[float])
|
||||
The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]</p></li>
|
||||
<li><p><strong>goal_vector</strong> – (List[float])
|
||||
The goal vector will weight each of the measurements to form an optimization goal. The vector should have
|
||||
the same length as the number of measurements, and it will be vector multiplied by the measurements.
|
||||
Positive values correspond to trying to maximize the particular measurement, and negative values
|
||||
correspond to trying to minimize the particular measurement.</li>
|
||||
<li><strong>future_measurements_weights</strong> – (List[float])
|
||||
correspond to trying to minimize the particular measurement.</p></li>
|
||||
<li><p><strong>future_measurements_weights</strong> – (List[float])
|
||||
The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
|
||||
goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
|
||||
then only the 3 last timesteps will be taken into account, according to the weights in the
|
||||
future_measurements_weights vector.</li>
|
||||
<li><strong>use_accumulated_reward_as_measurement</strong> – (bool)
|
||||
future_measurements_weights vector.</p></li>
|
||||
<li><p><strong>use_accumulated_reward_as_measurement</strong> – (bool)
|
||||
If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
|
||||
the measurements vector in the state. This can be useful in environments where the given measurements don’t
|
||||
include enough information for the particular goal the agent should achieve.</li>
|
||||
<li><strong>handling_targets_after_episode_end</strong> – (HandlingTargetsAfterEpisodeEnd)
|
||||
Dictates how to handle measurements that are outside the episode length.</li>
|
||||
<li><strong>scale_measurements_targets</strong> – (Dict[str, float])
|
||||
include enough information for the particular goal the agent should achieve.</p></li>
|
||||
<li><p><strong>handling_targets_after_episode_end</strong> – (HandlingTargetsAfterEpisodeEnd)
|
||||
Dictates how to handle measurements that are outside the episode length.</p></li>
|
||||
<li><p><strong>scale_measurements_targets</strong> – (Dict[str, float])
|
||||
Allows rescaling the values of each of the measurements available. This can be useful when the measurements
|
||||
have a different scale and you want to normalize them to the same scale.</li>
|
||||
have a different scale and you want to normalize them to the same scale.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
</dd></dl>
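<p>A runnable numpy sketch of the predicted-step offsets and the goal/weights scoring described above; all of the numbers are made-up examples.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

num_predicted_steps_ahead = 6
offsets = [2 ** i for i in range(num_predicted_steps_ahead)]   # [1, 2, 4, 8, 16, 32]

num_measurements, num_actions = 3, 4
goal_vector = np.array([1.0, 0.5, -1.0])                 # maximize m0, m1; minimize m2
future_measurements_weights = np.array([0.5, 0.5, 1.0])  # weight only the last 3 offsets

# predicted measurements per action: (actions, offsets, measurements)
predicted = np.random.rand(num_actions, num_predicted_steps_ahead, num_measurements)
future_values = predicted @ goal_vector                   # (actions, offsets)
action_values = future_values[:, -len(future_measurements_weights):] @ future_measurements_weights
best_action = int(np.argmax(action_values))               # passed to the exploration policy
</pre></div></div>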
</div>
|
||||
@@ -297,7 +296,7 @@ have a different scale and you want to normalize them to the same scale.</li>
|
||||
<a href="../value_optimization/double_dqn.html" class="btn btn-neutral float-right" title="Double DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../policy_optimization/sac.html" class="btn btn-neutral" title="Soft Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../policy_optimization/sac.html" class="btn btn-neutral float-left" title="Soft Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -306,7 +305,7 @@ have a different scale and you want to normalize them to the same scale.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -323,27 +322,16 @@ have a different scale and you want to normalize them to the same scale.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Actor-Critic — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Actor-Critic — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Agents" href="../index.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -235,41 +238,37 @@ distribution assigned with these probabilities. When testing, the action with th
|
||||
<p>A batch of <span class="math notranslate nohighlight">\(T_{max}\)</span> transitions is used, and the advantages are calculated upon it.</p>
|
||||
<p>Advantages can be calculated by either of the following methods (configured by the selected preset) -</p>
|
||||
<ol class="arabic simple">
|
||||
<li><strong>A_VALUE</strong> - Estimating advantage directly:
|
||||
<li><p><strong>A_VALUE</strong> - Estimating advantage directly:
|
||||
<span class="math notranslate nohighlight">\(A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t)\)</span>
|
||||
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch.</li>
|
||||
<li><strong>GAE</strong> - By following the <a class="reference external" href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper.</li>
|
||||
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch.</p></li>
|
||||
<li><p><strong>GAE</strong> - By following the <a class="reference external" href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a> paper.</p></li>
|
||||
</ol>
|
||||
<p>The advantages are then used in order to accumulate gradients according to
<span class="math notranslate nohighlight">\(L = -\mathop{\mathbb{E}} [log (\pi) \cdot A]\)</span></p>
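<p>A runnable sketch of the <strong>A_VALUE</strong> estimate above with made-up numbers (in practice the rewards come from the rollout and the values from the critic network):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0, 0.5, 1.0])        # r_t ... r_{t+k-1}, with k = T_max - state index
values  = np.array([2.0, 1.8, 1.9, 2.1, 2.2])   # V(s_t) ... V(s_{t+k})

k = len(rewards)
discounted_return = sum(gamma ** i * rewards[i] for i in range(k)) + gamma ** k * values[k]
advantage = discounted_return - values[0]       # A(s_t, a_t) = Q(s_t, a_t) - V(s_t)
</pre></div></div>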
<dl class="class">
|
||||
<dt id="rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.actor_critic_agent.</code><code class="descname">ActorCriticAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/actor_critic_agent.html#ActorCriticAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
The value that will be used to rescale the policy gradient</li>
|
||||
<li><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
The value that will be used to rescale the policy gradient</p></li>
|
||||
<li><p><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
The number of episodes to wait before applying the accumulated gradients to the network.
|
||||
The training iterations only accumulate gradients without actually applying them.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
The weight that will be given to the entropy regularization which is used in order to improve exploration.</li>
|
||||
<li><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
The training iterations only accumulate gradients without actually applying them.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
The weight that will be given to the entropy regularization which is used in order to improve exploration.</p></li>
|
||||
<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
|
||||
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</li>
|
||||
<li><strong>gae_lambda</strong> – (float)
|
||||
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</p></li>
|
||||
<li><p><strong>gae_lambda</strong> – (float)
|
||||
If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage estimation
|
||||
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.</li>
|
||||
<li><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value targets for the V head will be estimated using the GAE scheme.</li>
|
||||
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.</p></li>
|
||||
<li><p><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value targets for the V head will be estimated using the GAE scheme.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
</dd></dl>
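<p>A preset-style configuration sketch; the import paths and the specific values below are assumptions for illustration, not taken from this page.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
# assumed import paths; adjust to the installed Coach version if they differ
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler

agent_params = ActorCriticAgentParameters()
agent_params.algorithm.policy_gradient_rescaler = PolicyGradientRescaler.GAE
agent_params.algorithm.gae_lambda = 0.96
agent_params.algorithm.beta_entropy = 0.01
agent_params.algorithm.num_steps_between_gradient_updates = 20
</pre></div></div>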
</div>
|
||||
@@ -287,7 +286,7 @@ If set to True, the state value targets for the V head will be estimated using t
|
||||
<a href="acer.html" class="btn btn-neutral float-right" title="ACER" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../index.html" class="btn btn-neutral" title="Agents" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../index.html" class="btn btn-neutral float-left" title="Agents" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -296,7 +295,7 @@ If set to True, the state value targets for the V head will be estimated using t
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -313,27 +312,16 @@ If set to True, the state value targets for the V head will be estimated using t
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>ACER — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>ACER — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Actor-Critic" href="ac.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -236,11 +239,11 @@ distribution assigned with these probabilities. When testing, the action with th
|
||||
and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-policy updates from batches of <span class="math notranslate nohighlight">\(T_{max}\)</span> transitions sampled from the replay buffer.</p>
|
||||
<p>Each update performs the following procedure:</p>
|
||||
<ol class="arabic">
|
||||
<li><p class="first"><strong>Calculate state values:</strong></p>
|
||||
<li><p><strong>Calculate state values:</strong></p>
|
||||
<div class="math notranslate nohighlight">
|
||||
\[V(s_t) = \mathbb{E}_{a \sim \pi} [Q(s_t,a)]\]</div>
|
||||
</li>
|
||||
<li><p class="first"><strong>Calculate Q retrace:</strong></p>
|
||||
<li><p><strong>Calculate Q retrace:</strong></p>
|
||||
<blockquote>
|
||||
<div><div class="math notranslate nohighlight">
|
||||
\[Q^{ret}(s_t,a_t) = r_t +\gamma \bar{\rho}_{t+1}[Q^{ret}(s_{t+1},a_{t+1}) - Q(s_{t+1},a_{t+1})] + \gamma V(s_{t+1})\]</div>
|
||||
@@ -248,7 +251,7 @@ and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-p
|
||||
\[\text{where} \quad \bar{\rho}_{t} = \min{\left\{c,\rho_t\right\}},\quad \rho_t=\frac{\pi (a_t \mid s_t)}{\mu (a_t \mid s_t)}\]</div>
|
||||
</div></blockquote>
|
||||
</li>
|
||||
<li><p class="first"><strong>Accumulate gradients:</strong></p>
|
||||
<li><p><strong>Accumulate gradients:</strong></p>
|
||||
<blockquote>
|
||||
<div><p><span class="math notranslate nohighlight">\(\bullet\)</span> <strong>Policy gradients (with bias correction):</strong></p>
|
||||
<blockquote>
|
||||
@@ -263,7 +266,7 @@ and <span class="math notranslate nohighlight">\(n\)</span> (replay ratio) off-p
|
||||
</div></blockquote>
|
||||
</div></blockquote>
|
||||
</li>
|
||||
<li><p class="first"><strong>(Optional) Trust region update:</strong> change the policy loss gradient w.r.t network output:</p>
|
||||
<li><p><strong>(Optional) Trust region update:</strong> change the policy loss gradient w.r.t network output:</p>
|
||||
<blockquote>
|
||||
<div><div class="math notranslate nohighlight">
|
||||
\[\hat{g}_t^{trust-region} = \hat{g}_t^{policy} - \max \left\{0, \frac{k^T \hat{g}_t^{policy} - \delta}{\lVert k \rVert_2^2}\right\} k\]</div>
|
||||
@@ -277,39 +280,35 @@ The goal of the trust region update is to the difference between the updated pol
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.acer_agent.ACERAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.acer_agent.</code><code class="descname">ACERAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/acer_agent.html#ACERAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.acer_agent.ACERAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
|
||||
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</li>
|
||||
<li><strong>ratio_of_replay</strong> – (int)
|
||||
The number of off-policy training iterations in each ACER iteration.</li>
|
||||
<li><strong>num_transitions_to_start_replay</strong> – (int)
|
||||
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.</p></li>
|
||||
<li><p><strong>ratio_of_replay</strong> – (int)
|
||||
The number of off-policy training iterations in each ACER iteration.</p></li>
|
||||
<li><p><strong>num_transitions_to_start_replay</strong> – (int)
|
||||
Number of environment steps until ACER starts to train off-policy from the experience replay.
|
||||
This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions in
|
||||
the experience replay to start the off-policy training.</li>
|
||||
<li><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
the experience replay to start the off-policy training.</p></li>
|
||||
<li><p><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
The rate of the exponential moving average for the average policy which is used for the trust region optimization.
|
||||
The target network in this algorithm is used as the average policy.</li>
|
||||
<li><strong>importance_weight_truncation</strong> – (float)
|
||||
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).</li>
|
||||
<li><strong>use_trust_region_optimization</strong> – (bool)
|
||||
The target network in this algorithm is used as the average policy.</p></li>
|
||||
<li><p><strong>importance_weight_truncation</strong> – (float)
|
||||
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).</p></li>
|
||||
<li><p><strong>use_trust_region_optimization</strong> – (bool)
|
||||
If set to True, the gradients of the network will be modified with a term dependent on the KL divergence between
|
||||
the average policy and the current one, to bound the change of the policy during the network update.</li>
|
||||
<li><strong>max_KL_divergence</strong> – (float)
|
||||
the average policy and the current one, to bound the change of the policy during the network update.</p></li>
|
||||
<li><p><strong>max_KL_divergence</strong> – (float)
|
||||
The upper bound parameter for the trust region optimization; use_trust_region_optimization needs to be set to True
|
||||
for this parameter to have an effect.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
for this parameter to have an effect.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
An entropy regularization term can be added to the loss function in order to control exploration. This term
|
||||
is weighted using the beta value defined by beta_entropy.</li>
|
||||
is weighted using the beta value defined by beta_entropy.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
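To make the Q-retrace recursion above concrete, here is a minimal NumPy sketch of how the targets could be computed for one sampled trajectory. It is an illustration only, not Coach's implementation: the array names, the bootstrap handling, and the truncation constant c are assumptions for the example.

import numpy as np

def q_retrace_targets(rewards, q_taken, values, rho_bar, gamma=0.99, bootstrap_value=0.0):
    """Illustrative Q-retrace targets for a single trajectory.

    rewards[t]      - r_t
    q_taken[t]      - Q(s_t, a_t) for the action actually taken
    values[t]       - V(s_t), the expectation of Q(s_t, a) under the current policy
    rho_bar[t]      - min(c, pi(a_t | s_t) / mu(a_t | s_t)), the truncated importance weight
    bootstrap_value - V(s_T) of the state following the last transition (0 if terminal)
    """
    num_steps = len(rewards)
    q_ret = np.zeros(num_steps)
    # carries rho_bar_{t+1} * (Q_ret_{t+1} - Q_{t+1}) + V(s_{t+1}) backwards through time
    carry = bootstrap_value
    for t in reversed(range(num_steps)):
        q_ret[t] = rewards[t] + gamma * carry
        carry = rho_bar[t] * (q_ret[t] - q_taken[t]) + values[t]
    return q_ret

These targets would then be used for the critic part of the loss, while the truncated importance weights with bias correction drive the policy gradient, as described above.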
|
||||
|
||||
</div>
|
||||
@@ -327,7 +326,7 @@ is weighted using the beta value defined by beta_entropy.</li>
|
||||
<a href="../imitation/bc.html" class="btn btn-neutral float-right" title="Behavioral Cloning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="ac.html" class="btn btn-neutral" title="Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="ac.html" class="btn btn-neutral float-left" title="Actor-Critic" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -336,7 +335,7 @@ is weighted using the beta value defined by beta_entropy.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -353,27 +352,16 @@ is weighted using the beta value defined by beta_entropy.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Clipped Proximal Policy Optimization — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Clipped Proximal Policy Optimization — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Conditional Imitation Learning" href="../imitation/cil.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -233,17 +236,14 @@
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<p>Very similar to PPO, with several small (but very simplifying) changes:</p>
|
||||
<ol class="arabic">
|
||||
<li><p class="first">Train both the value and policy networks, simultaneously, by defining a single loss function,
|
||||
which is the sum of the networks’ loss functions. Then, backpropagate gradients only once from this unified loss function.</p>
|
||||
</li>
|
||||
<li><p class="first">The unified network’s optimizer is set to Adam (instead of L-BFGS for the value network as in PPO).</p>
|
||||
</li>
|
||||
<li><p class="first">Value targets are now also calculated based on the GAE advantages.
|
||||
<li><p>Train both the value and policy networks, simultaneously, by defining a single loss function,
|
||||
which is the sum of the networks’ loss functions. Then, backpropagate gradients only once from this unified loss function.</p></li>
|
||||
<li><p>The unified network’s optimizer is set to Adam (instead of L-BFGS for the value network as in PPO).</p></li>
|
||||
<li><p>Value targets are now also calculated based on the GAE advantages.
|
||||
In this method, the <span class="math notranslate nohighlight">\(V\)</span> values are predicted from the critic network, and then added to the GAE based advantages,
|
||||
in order to get a <span class="math notranslate nohighlight">\(Q\)</span> value for each action. Now, since our critic network is predicting a <span class="math notranslate nohighlight">\(V\)</span> value for
|
||||
each state, setting the calculated <span class="math notranslate nohighlight">\(Q\)</span> action-values as the target will, on average, serve as a <span class="math notranslate nohighlight">\(V\)</span> state-value target.</p>
|
||||
</li>
|
||||
<li><p class="first">Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio
|
||||
each state, setting the calculated <span class="math notranslate nohighlight">\(Q\)</span> action-values as the target will, on average, serve as a <span class="math notranslate nohighlight">\(V\)</span> state-value target.</p></li>
|
||||
<li><p>Instead of adapting the penalizing KL divergence coefficient used in PPO, the likelihood ratio
|
||||
<span class="math notranslate nohighlight">\(r_t(\theta) =\frac{\pi_{\theta}(a|s)}{\pi_{\theta_{old}}(a|s)}\)</span> is clipped, to achieve a similar effect.
|
||||
This is done by defining the policy’s loss function to be the minimum between the standard surrogate loss and an epsilon
|
||||
clipped surrogate loss:</p>
|
||||
@@ -253,46 +253,42 @@ clipped surrogate loss:</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.clipped_ppo_agent.</code><code class="descname">ClippedPPOAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/clipped_ppo_agent.html#ClippedPPOAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
This represents how the critic will be used to update the actor. The critic value function is typically used
|
||||
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
|
||||
advantage of the action, or the generalized advantage estimation (GAE) value.</li>
|
||||
<li><strong>gae_lambda</strong> – (float)
|
||||
advantage of the action, or the generalized advantage estimation (GAE) value.</p></li>
|
||||
<li><p><strong>gae_lambda</strong> – (float)
|
||||
The <span class="math notranslate nohighlight">\(\lambda\)</span> value is used within the GAE function in order to weight different bootstrap length
|
||||
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
|
||||
n-step estimations.</li>
|
||||
<li><strong>clip_likelihood_ratio_using_epsilon</strong> – (float)
|
||||
n-step estimations.</p></li>
|
||||
<li><p><strong>clip_likelihood_ratio_using_epsilon</strong> – (float)
|
||||
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
|
||||
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
|
||||
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
|
||||
implementations.</li>
|
||||
<li><strong>value_targets_mix_fraction</strong> – (float)
|
||||
implementations.</p></li>
|
||||
<li><p><strong>value_targets_mix_fraction</strong> – (float)
|
||||
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
|
||||
define how much of the new targets will be taken into account when calculating the loss.
|
||||
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</li>
|
||||
<li><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value will be estimated using the GAE technique.</li>
|
||||
<li><strong>use_kl_regularization</strong> – (bool)
|
||||
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</p></li>
|
||||
<li><p><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value will be estimated using the GAE technique.</p></li>
|
||||
<li><p><strong>use_kl_regularization</strong> – (bool)
|
||||
If set to True, the loss function will be regularized using the KL divergence between the current and new
|
||||
policy, to bound the change of the policy during the network update.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
policy, to bound the change of the policy during the network update.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
An entropy regularization term can be added to the loss function in order to control exploration. This term
|
||||
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</li>
|
||||
<li><strong>optimization_epochs</strong> – (int)
|
||||
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</p></li>
|
||||
<li><p><strong>optimization_epochs</strong> – (int)
|
||||
For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
|
||||
optimization_epochs value.</li>
|
||||
<li><strong>clipping_decay_schedule</strong> – (Schedule)
|
||||
Can be used to define a schedule over the clipping of the likelihood ratio.</li>
|
||||
optimization_epochs value.</p></li>
|
||||
<li><p><strong>clipping_decay_schedule</strong> – (Schedule)
|
||||
Can be used to define a schedule over the clipping of the likelihood ratio.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
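For illustration, the clipped surrogate objective described above can be written in a few lines of NumPy. This is a sketch under the assumption that per-transition log probabilities and advantages are already available; it is not Coach's implementation.

import numpy as np

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, clip_epsilon=0.2):
    """Clipped PPO policy loss for a batch of transitions (a value to be minimized)."""
    # likelihood ratio r_t(theta) = pi_theta(a|s) / pi_theta_old(a|s)
    ratio = np.exp(new_log_probs - old_log_probs)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    # the pessimistic (minimum) surrogate, averaged over the batch and negated for gradient descent
    return -np.mean(np.minimum(unclipped, clipped))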
|
||||
|
||||
</div>
|
||||
@@ -310,7 +306,7 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
|
||||
<a href="ddpg.html" class="btn btn-neutral float-right" title="Deep Deterministic Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../imitation/cil.html" class="btn btn-neutral" title="Conditional Imitation Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../imitation/cil.html" class="btn btn-neutral float-left" title="Conditional Imitation Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -319,7 +315,7 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -336,27 +332,16 @@ Can be used to define a schedule over the clipping of the likelihood ratio.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Deep Deterministic Policy Gradient — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Deep Deterministic Policy Gradient — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Clipped Proximal Policy Optimization" href="cppo.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -235,14 +238,14 @@ to add exploration noise to the action. When testing, use the mean vector <span
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<p>Start by sampling a batch of transitions from the experience replay.</p>
|
||||
<ul>
|
||||
<li><p class="first">To train the <strong>critic network</strong>, use the following targets:</p>
|
||||
<li><p>To train the <strong>critic network</strong>, use the following targets:</p>
|
||||
<p><span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} ))\)</span></p>
|
||||
<p>First run the actor target network, using the next states as the inputs, and get <span class="math notranslate nohighlight">\(\mu (s_{t+1} )\)</span>.
|
||||
Next, run the critic target network using the next states and <span class="math notranslate nohighlight">\(\mu (s_{t+1} )\)</span>, and use the output to
|
||||
calculate <span class="math notranslate nohighlight">\(y_t\)</span> according to the equation above. To train the network, use the current states and actions
|
||||
as the inputs, and <span class="math notranslate nohighlight">\(y_t\)</span> as the targets.</p>
|
||||
</li>
|
||||
<li><p class="first">To train the <strong>actor network</strong>, use the following equation:</p>
|
||||
<li><p>To train the <strong>actor network</strong>, use the following equation:</p>
|
||||
<p><span class="math notranslate nohighlight">\(\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]\)</span></p>
|
||||
<p>Use the actor’s online network to get the action mean values using the current states as the inputs.
|
||||
Then, use the critic online network in order to get the gradients of the critic output with respect to the
|
||||
@@ -255,35 +258,31 @@ given <span class="math notranslate nohighlight">\(\nabla_a Q(s,a)\)</span>. Fin
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.ddpg_agent.</code><code class="descname">DDPGAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/ddpg_agent.html#DDPGAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</li>
|
||||
<li><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</p></li>
|
||||
<li><p><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
When copying the online network weights to the target network weights, a soft update will be used, which
|
||||
weights the new online network weights by rate_for_copying_weights_to_target</li>
|
||||
<li><strong>num_consecutive_playing_steps</strong> – (StepMethod)
|
||||
The number of consecutive steps to act between every two training iterations</li>
|
||||
<li><strong>use_target_network_for_evaluation</strong> – (bool)
|
||||
weights the new online network weights by rate_for_copying_weights_to_target</p></li>
|
||||
<li><p><strong>num_consecutive_playing_steps</strong> – (StepMethod)
|
||||
The number of consecutive steps to act between every two training iterations</p></li>
|
||||
<li><p><strong>use_target_network_for_evaluation</strong> – (bool)
|
||||
If set to True, the target network will be used for predicting the actions when choosing actions to act.
|
||||
Since the target network weights change more slowly, the predicted actions will be more consistent.</li>
|
||||
<li><strong>action_penalty</strong> – (float)
|
||||
Since the target network weights change more slowly, the predicted actions will be more consistent.</p></li>
|
||||
<li><p><strong>action_penalty</strong> – (float)
|
||||
The amount by which to penalize the network on high action feature (pre-activation) values.
|
||||
This can prevent the action features from saturating the TanH activation function, and therefore prevent the
|
||||
gradients from becoming very low.</li>
|
||||
<li><strong>clip_critic_targets</strong> – (Tuple[float, float] or None)
|
||||
The range to clip the critic target to in order to prevent overestimation of the action values.</li>
|
||||
<li><strong>use_non_zero_discount_for_terminal_states</strong> – (bool)
|
||||
gradients from becoming very low.</p></li>
|
||||
<li><p><strong>clip_critic_targets</strong> – (Tuple[float, float] or None)
|
||||
The range to clip the critic target to in order to prevent overestimation of the action values.</p></li>
|
||||
<li><p><strong>use_non_zero_discount_for_terminal_states</strong> – (bool)
|
||||
If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
|
||||
values. If set to False, the terminal states reward will be taken as the target return for the network.</li>
|
||||
values. If set to False, the terminal states reward will be taken as the target return for the network.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
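The critic targets and the soft target-network update described above can be sketched as follows. The target_actor and target_critic objects and their predict methods are placeholders for this example, not Coach's API.

import numpy as np

def ddpg_critic_targets(rewards, next_states, game_overs, target_actor, target_critic, gamma=0.99):
    """y_t = r_t + gamma * Q_target(s_{t+1}, mu_target(s_{t+1})), with terminal states masked out."""
    next_actions = target_actor.predict(next_states)
    next_q_values = target_critic.predict(next_states, next_actions)
    return np.asarray(rewards) + gamma * (1.0 - np.asarray(game_overs)) * next_q_values

def soft_target_update(target_weights, online_weights, tau):
    """Soft update: target <- tau * online + (1 - tau) * target, applied per weight tensor."""
    return [tau * online + (1.0 - tau) * target
            for online, target in zip(online_weights, target_weights)]

Here tau plays the role of rate_for_copying_weights_to_target, so that the target networks track the online networks slowly.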
|
||||
|
||||
</div>
|
||||
@@ -301,7 +300,7 @@ values. If set to False, the terminal states reward will be taken as the target
|
||||
<a href="sac.html" class="btn btn-neutral float-right" title="Soft Actor-Critic" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="cppo.html" class="btn btn-neutral" title="Clipped Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="cppo.html" class="btn btn-neutral float-left" title="Clipped Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -310,7 +309,7 @@ values. If set to False, the terminal states reward will be taken as the target
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -327,27 +326,16 @@ values. If set to False, the terminal states reward will be taken as the target
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Hierarchical Actor Critic — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Hierarchical Actor Critic — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -31,21 +39,16 @@
|
||||
<link rel="search" title="Search" href="../../../search.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -212,7 +215,7 @@ to add exploration noise to the action. When testing, use the mean vector <span
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -229,27 +232,16 @@ to add exploration noise to the action. When testing, use the mean vector <span
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Policy Gradient — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Policy Gradient — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Persistent Advantage Learning" href="../value_optimization/pal.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -237,11 +240,11 @@ The <code class="code docutils literal notranslate"><span class="pre">PolicyGrad
|
||||
This is done in order to reduce the variance of the updates, since noisy gradient updates might destabilize the policy’s
|
||||
convergence. The rescaler is a configurable parameter and there are a few options to choose from:</p>
|
||||
<ul class="simple">
|
||||
<li><strong>Total Episode Return</strong> - The sum of all the discounted rewards during the episode.</li>
|
||||
<li><strong>Future Return</strong> - Return from each transition until the end of the episode.</li>
|
||||
<li><strong>Future Return Normalized by Episode</strong> - Future returns across the episode normalized by the episode’s mean and standard deviation.</li>
|
||||
<li><strong>Future Return Normalized by Timestep</strong> - Future returns normalized using running means and standard deviations,
|
||||
which are calculated separately for each timestep, across different episodes.</li>
|
||||
<li><p><strong>Total Episode Return</strong> - The sum of all the discounted rewards during the episode.</p></li>
|
||||
<li><p><strong>Future Return</strong> - Return from each transition until the end of the episode.</p></li>
|
||||
<li><p><strong>Future Return Normalized by Episode</strong> - Future returns across the episode normalized by the episode’s mean and standard deviation.</p></li>
|
||||
<li><p><strong>Future Return Normalized by Timestep</strong> - Future returns normalized using running means and standard deviations,
|
||||
which are calculated separately for each timestep, across different episodes.</p></li>
|
||||
</ul>
|
||||
<p>Gradients are accumulated over a number of fully played episodes. The gradient accumulation over several episodes
|
||||
serves the same purpose - reducing the update variance. After accumulating gradients for several episodes,
|
||||
@@ -249,32 +252,28 @@ the gradients are then applied to the network.</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.policy_gradients_agent.PolicyGradientAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.policy_gradients_agent.</code><code class="descname">PolicyGradientAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/policy_gradients_agent.html#PolicyGradientAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.policy_gradients_agent.PolicyGradientAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
The rescaler type to use for the policy gradient loss. For policy gradients, we calculate the log probability of
|
||||
the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
|
||||
return, but there are other rescalers that are intended for reducing the variance of the updates.</li>
|
||||
<li><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
return, but there are other rescalers that are intended for reducing the variance of the updates.</p></li>
|
||||
<li><p><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
The number of episodes between applying the accumulated gradients to the network. After every
|
||||
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data;
|
||||
it will then accumulate them in internal accumulators, and will only apply them to the network once in every
|
||||
apply_gradients_every_x_episodes episodes.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
apply_gradients_every_x_episodes episodes.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
|
||||
will be added to the loss and scaled by the given beta factor.</li>
|
||||
<li><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
will be added to the loss and scaled by the given beta factor.</p></li>
|
||||
<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
|
||||
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
|
||||
are used in the batch.</li>
|
||||
are used in the batch.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
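As a small illustration of the rescalers listed above, the following sketch computes the discounted future return for each transition of an episode and the episode-normalized variant. It is an example with assumed inputs, not Coach's implementation.

import numpy as np

def discounted_future_returns(rewards, gamma=0.99):
    """Reward-to-go (future return) for every transition of a single episode."""
    returns = np.zeros(len(rewards))
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        running_return = rewards[t] + gamma * running_return
        returns[t] = running_return
    return returns

def future_return_normalized_by_episode(rewards, gamma=0.99):
    """The 'Future Return Normalized by Episode' rescaler: standardize by the episode's own statistics."""
    returns = discounted_future_returns(rewards, gamma)
    return (returns - returns.mean()) / (returns.std() + 1e-8)

The policy gradient loss then multiplies the log probability of each taken action by the chosen rescaler value, as described in the parameters above.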
|
||||
|
||||
</div>
|
||||
@@ -292,7 +291,7 @@ are used in the batch.</li>
|
||||
<a href="ppo.html" class="btn btn-neutral float-right" title="Proximal Policy Optimization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../value_optimization/pal.html" class="btn btn-neutral" title="Persistent Advantage Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../value_optimization/pal.html" class="btn btn-neutral float-left" title="Persistent Advantage Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -301,7 +300,7 @@ are used in the batch.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -318,27 +317,16 @@ are used in the batch.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Proximal Policy Optimization — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Proximal Policy Optimization — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Policy Gradient" href="pg.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -234,66 +237,62 @@ When testing, just take the mean values predicted by the network.</p>
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Collect a big chunk of experience (in the order of thousands of transitions, sampled from multiple episodes).</li>
|
||||
<li>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman ‘2015).</li>
|
||||
<li>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers,
|
||||
<li><p>Collect a big chunk of experience (in the order of thousands of transitions, sampled from multiple episodes).</p></li>
|
||||
<li><p>Calculate the advantages for each transition, using the <em>Generalized Advantage Estimation</em> method (Schulman ‘2015).</p></li>
|
||||
<li><p>Run a single training iteration of the value network using an L-BFGS optimizer. Unlike first order optimizers,
|
||||
the L-BFGS optimizer runs on the entire dataset at once, without batching.
|
||||
It continues running until some low loss threshold is reached. To prevent overfitting to the current dataset,
|
||||
the value targets are updated in a soft manner, using an Exponentially Weighted Moving Average, based on the total
|
||||
discounted returns of each state in each episode.</li>
|
||||
<li>Run several training iterations of the policy network. This is done by using the previously calculated advantages as
|
||||
discounted returns of each state in each episode.</p></li>
|
||||
<li><p>Run several training iterations of the policy network. This is done by using the previously calculated advantages as
|
||||
targets. The loss function penalizes policies that deviate too far from the old policy (the policy that was used <em>before</em>
|
||||
starting to run the current set of training iterations) using a regularization term.</li>
|
||||
<li>After training is done, the last sampled KL divergence value will be compared with the <em>target KL divergence</em> value,
|
||||
starting to run the current set of training iterations) using a regularization term.</p></li>
|
||||
<li><p>After training is done, the last sampled KL divergence value will be compared with the <em>target KL divergence</em> value,
|
||||
in order to adapt the penalty coefficient used in the policy loss. If the KL divergence went too high,
|
||||
increase the penalty; if it went too low, reduce it. Otherwise, leave it unchanged.</li>
|
||||
increase the penalty; if it went too low, reduce it. Otherwise, leave it unchanged.</p></li>
|
||||
</ol>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.ppo_agent.PPOAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.ppo_agent.</code><code class="descname">PPOAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/ppo_agent.html#PPOAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.ppo_agent.PPOAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>policy_gradient_rescaler</strong> – (PolicyGradientRescaler)
|
||||
This represents how the critic will be used to update the actor. The critic value function is typically used
|
||||
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
|
||||
advantage of the action, or the generalized advantage estimation (GAE) value.</li>
|
||||
<li><strong>gae_lambda</strong> – (float)
|
||||
advantage of the action, or the generalized advantage estimation (GAE) value.</p></li>
|
||||
<li><p><strong>gae_lambda</strong> – (float)
|
||||
The <span class="math notranslate nohighlight">\(\lambda\)</span> value is used within the GAE function in order to weight different bootstrap length
|
||||
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
|
||||
n-step estimations.</li>
|
||||
<li><strong>target_kl_divergence</strong> – (float)
|
||||
n-step estimations.</p></li>
|
||||
<li><p><strong>target_kl_divergence</strong> – (float)
|
||||
The target kl divergence between the current policy distribution and the new policy. PPO uses a heuristic to
|
||||
bring the KL divergence to this value, by adding a penalty if the kl divergence is higher.</li>
|
||||
<li><strong>initial_kl_coefficient</strong> – (float)
|
||||
bring the KL divergence to this value, by adding a penalty if the kl divergence is higher.</p></li>
|
||||
<li><p><strong>initial_kl_coefficient</strong> – (float)
|
||||
The initial weight that will be given to the KL divergence between the current and the new policy in the
|
||||
regularization factor.</li>
|
||||
<li><strong>high_kl_penalty_coefficient</strong> – (float)
|
||||
The penalty that will be given for KL divergence values which are higher than what was defined as the target.</li>
|
||||
<li><strong>clip_likelihood_ratio_using_epsilon</strong> – (float)
|
||||
regularization factor.</p></li>
|
||||
<li><p><strong>high_kl_penalty_coefficient</strong> – (float)
|
||||
The penalty that will be given for KL divergence values which are higher than what was defined as the target.</p></li>
|
||||
<li><p><strong>clip_likelihood_ratio_using_epsilon</strong> – (float)
|
||||
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
|
||||
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
|
||||
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
|
||||
implementations.</li>
|
||||
<li><strong>value_targets_mix_fraction</strong> – (float)
|
||||
implementations.</p></li>
|
||||
<li><p><strong>value_targets_mix_fraction</strong> – (float)
|
||||
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
|
||||
define how much of the new targets will be taken into account when calculating the loss.
|
||||
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</li>
|
||||
<li><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value will be estimated using the GAE technique.</li>
|
||||
<li><strong>use_kl_regularization</strong> – (bool)
|
||||
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.</p></li>
|
||||
<li><p><strong>estimate_state_value_using_gae</strong> – (bool)
|
||||
If set to True, the state value will be estimated using the GAE technique.</p></li>
|
||||
<li><p><strong>use_kl_regularization</strong> – (bool)
|
||||
If set to True, the loss function will be regularized using the KL divergence between the current and new
|
||||
policy, to bound the change of the policy during the network update.</li>
|
||||
<li><strong>beta_entropy</strong> – (float)
|
||||
policy, to bound the change of the policy during the network update.</p></li>
|
||||
<li><p><strong>beta_entropy</strong> – (float)
|
||||
An entropy regularization term can be added to the loss function in order to control exploration. This term
|
||||
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</li>
|
||||
is weighted using the <span class="math notranslate nohighlight">\(\beta\)</span> value defined by beta_entropy.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
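Since both the value targets and the policy loss above rely on the Generalized Advantage Estimation values, here is a minimal sketch of the GAE computation for one episode segment. The function signature is an assumption for the example and is not Coach's API.

import numpy as np

def gae_advantages(rewards, values, bootstrap_value, gamma=0.99, gae_lambda=0.95):
    """Generalized Advantage Estimation for a sequence of transitions.

    values[t]       - V(s_t) predicted by the value network
    bootstrap_value - V(s_T) of the state following the last transition (0 if terminal)
    """
    values = np.append(values, bootstrap_value)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        # one-step temporal-difference error
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        # exponentially weighted sum of TD errors, decayed by gamma * lambda
        gae = delta + gamma * gae_lambda * gae
        advantages[t] = gae
    return advantages

Typical gae_lambda values are in the 0.9-1 range, as noted in the parameter description above.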
|
||||
|
||||
</div>
|
||||
@@ -311,7 +310,7 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
|
||||
<a href="../value_optimization/rainbow.html" class="btn btn-neutral float-right" title="Rainbow" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="pg.html" class="btn btn-neutral" title="Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="pg.html" class="btn btn-neutral float-left" title="Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -320,7 +319,7 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -337,27 +336,16 @@ is weighted using the <span class="math notranslate nohighlight">\(eta\)</span>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Soft Actor-Critic — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Soft Actor-Critic — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Deep Deterministic Policy Gradient" href="ddpg.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -235,19 +238,19 @@ by picking the mean value or sample from a gaussian distribution like in trainin
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<p>Start by sampling a batch <span class="math notranslate nohighlight">\(B\)</span> of transitions from the experience replay.</p>
|
||||
<ul>
|
||||
<li><p class="first">To train the <strong>Q network</strong>, use the following targets:</p>
|
||||
<li><p>To train the <strong>Q network</strong>, use the following targets:</p>
|
||||
<div class="math notranslate nohighlight">
|
||||
\[y_t^Q=r(s_t,a_t)+\gamma \cdot V(s_{t+1})\]</div>
|
||||
<p>The state value used in the above target is acquired by running the target state value network.</p>
|
||||
</li>
|
||||
<li><p class="first">To train the <strong>State Value network</strong>, use the following targets:</p>
|
||||
<li><p>To train the <strong>State Value network</strong>, use the following targets:</p>
|
||||
<div class="math notranslate nohighlight">
|
||||
\[y_t^V = \min_{i=1,2}Q_i(s_t,\tilde{a}) - \log\pi (\tilde{a} \vert s_t),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)\]</div>
|
||||
<p>The state value network is trained using a sample-based approximation of the connection between the state value and the
state-action values. The actions used for constructing the target are <strong>not</strong> sampled from the replay buffer, but rather sampled
from the current policy.</p>
|
||||
</li>
|
||||
<li><p class="first">To train the <strong>actor network</strong>, use the following equation:</p>
|
||||
<li><p>To train the <strong>actor network</strong>, use the following equation:</p>
|
||||
<div class="math notranslate nohighlight">
|
||||
\[\nabla_{\theta} J \approx \nabla_{\theta} \frac{1}{\vert B \vert} \sum_{s_t\in B} \left( Q \left(s_t, \tilde{a}_\theta(s_t)\right) - \log\pi_{\theta}(\tilde{a}_{\theta}(s_t)\vert s_t) \right),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)\]</div>
|
||||
</li>
|
||||
@@ -256,24 +259,20 @@ from the current policy.</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.soft_actor_critic_agent.</code><code class="descname">SoftActorCriticAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/soft_actor_critic_agent.html#SoftActorCriticAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</li>
|
||||
<li><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</p></li>
|
||||
<li><p><strong>rate_for_copying_weights_to_target</strong> – (float)
|
||||
When copying the online network weights to the target network weights, a soft update will be used, which
weights the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)</li>
<li><strong>use_deterministic_for_evaluation</strong> – (bool)
weights the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)</p></li>
<li><p><strong>use_deterministic_for_evaluation</strong> – (bool)
If True, during the evaluation phase, actions are chosen deterministically according to the policy mean
and not sampled from the policy distribution.</li>
and not sampled from the policy distribution.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -291,7 +290,7 @@ and not sampled from the policy distribution.</li>
|
||||
<a href="../other/dfp.html" class="btn btn-neutral float-right" title="Direct Future Prediction" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="ddpg.html" class="btn btn-neutral" title="Deep Deterministic Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="ddpg.html" class="btn btn-neutral float-left" title="Deep Deterministic Policy Gradient" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -300,7 +299,7 @@ and not sampled from the policy distribution.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -317,27 +316,16 @@ and not sampled from the policy distribution.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Bootstrapped DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Bootstrapped DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Behavioral Cloning" href="../imitation/bc.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -265,7 +268,7 @@ Then, train the online network according to the calculated targets.</p>
|
||||
<a href="categorical_dqn.html" class="btn btn-neutral float-right" title="Categorical DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../imitation/bc.html" class="btn btn-neutral" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../imitation/bc.html" class="btn btn-neutral float-left" title="Behavioral Cloning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -274,7 +277,7 @@ Then, train the online network according to the calculated targets.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -291,27 +294,16 @@ Then, train the online network according to the calculated targets.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Categorical DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Categorical DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Bootstrapped DQN" href="bs_dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -227,43 +230,36 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic">
|
||||
<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
|
||||
</li>
|
||||
<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
|
||||
that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
|
||||
<p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
|
||||
<p>where:</p>
<ul class="simple">
<li><span class="math notranslate nohighlight">\([ \cdot ]\)</span> bounds its argument in the range <span class="math notranslate nohighlight">\([a, b]\)</span></li>
<li><span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom <span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r+\gamma z_j\)</span></li>
</ul>
|
||||
</li>
|
||||
<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
|
||||
probability distribution. Only the target of the actions that were actually taken is updated.</p>
|
||||
</li>
|
||||
<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
|
||||
</li>
|
||||
<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
|
||||
probability distribution. Only the target of the actions that were actually taken is updated.</p></li>
|
||||
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
|
||||
</ol>
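<p>The projection in step 2 is the heart of C51. The following is a minimal NumPy sketch of that projection, not the Coach implementation: the function name and the array arguments (rewards, discounts, next_probs) are hypothetical, and the support z is assumed to be linspace(v_min, v_max, atoms).</p>
<div class="highlight"><pre>
import numpy as np

def project_distribution(rewards, discounts, next_probs, z):
    # rewards, discounts: (batch,) arrays; discounts hold gamma, or 0 for terminal states.
    # next_probs: (batch, atoms) probabilities p_j(s_{t+1}, pi(s_{t+1})) from the target network.
    # z: (atoms,) fixed support between v_min and v_max.
    v_min, v_max = z[0], z[-1]
    delta_z = z[1] - z[0]
    rows = np.arange(rewards.shape[0])
    # Bellman update of every atom, bounded to [v_min, v_max]
    t_z = np.clip(rewards[:, None] + discounts[:, None] * z[None, :], v_min, v_max)
    b = (t_z - v_min) / delta_z
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    projected = np.zeros_like(next_probs)
    for j in range(z.shape[0]):
        # split atom j's probability between its two neighbouring support points
        # (atoms landing exactly on a support point get zero weight here; real
        # implementations add a small correction for that degenerate case)
        np.add.at(projected, (rows, lower[:, j]), next_probs[:, j] * (upper[:, j] - b[:, j]))
        np.add.at(projected, (rows, upper[:, j]), next_probs[:, j] * (b[:, j] - lower[:, j]))
    return projected
</pre></div>
<p>The cross entropy loss in step 3 is then taken between this projected distribution and the online network output for the action that was actually played.</p>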
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.categorical_dqn_agent.</code><code class="descname">CategoricalDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/categorical_dqn_agent.html#CategoricalDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>v_min</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>v_min</strong> – (float)
|
||||
The minimal value that will be represented in the network output for predicting the Q value.
|
||||
Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</li>
|
||||
<li><strong>v_max</strong> – (float)
|
||||
Corresponds to <span class="math notranslate nohighlight">\(v_{min}\)</span> in the paper.</p></li>
|
||||
<li><p><strong>v_max</strong> – (float)
|
||||
The maximum value that will be represented in the network output for predicting the Q value.
|
||||
Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</li>
|
||||
<li><strong>atoms</strong> – (int)
|
||||
Corresponds to <span class="math notranslate nohighlight">\(v_{max}\)</span> in the paper.</p></li>
|
||||
<li><p><strong>atoms</strong> – (int)
|
||||
The number of atoms that will be used to discretize the range between v_min and v_max.
|
||||
For the C51 algorithm described in the paper, the number of atoms is 51.</li>
|
||||
For the C51 algorithm described in the paper, the number of atoms is 51.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -281,7 +277,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
|
||||
<a href="../imitation/cil.html" class="btn btn-neutral float-right" title="Conditional Imitation Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="bs_dqn.html" class="btn btn-neutral" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="bs_dqn.html" class="btn btn-neutral float-left" title="Bootstrapped DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -290,7 +286,7 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -307,27 +303,16 @@ For the C51 algorithm described in the paper, the number of atoms is 51.</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Double DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Double DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Direct Future Prediction" href="../other/dfp.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -227,17 +230,17 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>Using the next states from the sampled batch, run the online network in order to find the <span class="math notranslate nohighlight">\(Q\)</span> maximizing
|
||||
action <span class="math notranslate nohighlight">\(argmax_a Q(s_{t+1},a)\)</span>. For these actions, use the corresponding next states and run the target
|
||||
network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</li>
|
||||
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
|
||||
network to calculate <span class="math notranslate nohighlight">\(Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span>.</p></li>
|
||||
<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
|
||||
use the current states from the sampled batch, and run the online network to get the current Q values predictions.
|
||||
Set those values as the targets for the actions that were not actually played.</li>
|
||||
<li>For each action that was played, use the following equation for calculating the targets of the network:
|
||||
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
|
||||
<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
|
||||
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
|
||||
Set those values as the targets for the actions that were not actually played.</p></li>
|
||||
<li><p>For each action that was played, use the following equation for calculating the targets of the network:
|
||||
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
|
||||
<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
|
||||
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
|
||||
</ol>
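<p>To make steps 2-4 concrete, here is a rough NumPy sketch of how the Double DQN targets for a batch could be assembled. The array names are hypothetical and this is only an illustration, not the Coach code.</p>
<div class="highlight"><pre>
import numpy as np

def double_dqn_targets(rewards, discounts, actions, online_q_next, target_q_next, online_q_current):
    # online_q_next, target_q_next: (batch, num_actions) Q values for s_{t+1}
    # online_q_current: (batch, num_actions) current predictions, reused for non-played actions
    rows = np.arange(rewards.shape[0])
    best_actions = np.argmax(online_q_next, axis=1)    # argmax_a Q_online(s_{t+1}, a)
    bootstrap = target_q_next[rows, best_actions]      # evaluate that action with the target network
    targets = online_q_current.copy()                  # keeps the loss at zero for non-played actions
    targets[rows, actions] = rewards + discounts * bootstrap
    return targets
</pre></div>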
|
||||
</div>
|
||||
</div>
|
||||
@@ -254,7 +257,7 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
<a href="dqn.html" class="btn btn-neutral float-right" title="Deep Q Networks" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../other/dfp.html" class="btn btn-neutral" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../other/dfp.html" class="btn btn-neutral float-left" title="Direct Future Prediction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -263,7 +266,7 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -280,27 +283,16 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Deep Q Networks — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Deep Q Networks — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Double DQN" href="double_dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -227,16 +230,16 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
|
||||
the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</li>
|
||||
<li>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>Using the next states from the sampled batch, run the target network to calculate the <span class="math notranslate nohighlight">\(Q\)</span> values for each of
|
||||
the actions <span class="math notranslate nohighlight">\(Q(s_{t+1},a)\)</span>, and keep only the maximum value for each state.</p></li>
|
||||
<li><p>In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss),
|
||||
use the current states from the sampled batch, and run the online network to get the current Q values predictions.
|
||||
Set those values as the targets for the actions that were not actually played.</li>
|
||||
<li>For each action that was played, use the following equation for calculating the targets of the network:
|
||||
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot max_a Q(s_{t+1})\)</span></li>
|
||||
<li>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</li>
|
||||
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
|
||||
Set those values as the targets for the actions that were not actually played.</p></li>
|
||||
<li><p>For each action that was played, use the following equation for calculating the targets of the network:
|
||||
<span class="math notranslate nohighlight">\(y_t=r(s_t,a_t )+\gamma \cdot max_a Q(s_{t+1})\)</span></p></li>
|
||||
<li><p>Finally, train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
|
||||
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
|
||||
</ol>
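<p>A hedged NumPy sketch of the target construction in steps 2-5, with hypothetical array names (an illustration, not the Coach implementation):</p>
<div class="highlight"><pre>
import numpy as np

def dqn_targets(rewards, discounts, actions, target_q_next, online_q_current):
    # target_q_next: (batch, num_actions) Q values of s_{t+1} from the target network
    # online_q_current: (batch, num_actions) current predictions, reused for non-played actions
    rows = np.arange(rewards.shape[0])
    bootstrap = np.max(target_q_next, axis=1)          # max_a Q(s_{t+1}, a)
    targets = online_q_current.copy()                  # keeps the loss at zero for non-played actions
    targets[rows, actions] = rewards + discounts * bootstrap
    return targets
</pre></div>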
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.dqn_agent.DQNAlgorithmParameters">
|
||||
@@ -258,7 +261,7 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
<a href="dueling_dqn.html" class="btn btn-neutral float-right" title="Dueling DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="double_dqn.html" class="btn btn-neutral" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="double_dqn.html" class="btn btn-neutral float-left" title="Double DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -267,7 +270,7 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -284,27 +287,16 @@ Set those values as the targets for the actions that were not actually played.</
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Dueling DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Dueling DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Deep Q Networks" href="dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -245,7 +248,7 @@ single action has been taken at this state.</p>
|
||||
<a href="mmc.html" class="btn btn-neutral float-right" title="Mixed Monte Carlo" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="dqn.html" class="btn btn-neutral" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="dqn.html" class="btn btn-neutral float-left" title="Deep Q Networks" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -254,7 +257,7 @@ single action has been taken at this state.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -271,27 +274,16 @@ single action has been taken at this state.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Mixed Monte Carlo — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Mixed Monte Carlo — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Dueling DQN" href="dueling_dqn.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -238,16 +241,13 @@ Once in every few thousand steps, copy the weights from the online network to th
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.mmc_agent.</code><code class="descname">MixedMonteCarloAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/mmc_agent.html#MixedMonteCarloAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>monte_carlo_mixing_rate</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>monte_carlo_mixing_rate</strong> – (float)
|
||||
The mixing rate is used for setting the amount of the Monte Carlo estimate (full return) that will be mixed into
|
||||
the single-step bootstrapped targets.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
the single-step bootstrapped targets.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
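<p>In effect, monte_carlo_mixing_rate blends the one-step bootstrapped target with the full discounted return observed for the episode. A minimal sketch of the blend, with hypothetical names and not taken from the Coach code:</p>
<div class="highlight"><pre>
def mixed_monte_carlo_target(one_step_target, full_return, monte_carlo_mixing_rate):
    # convex combination of the bootstrapped target and the Monte Carlo (full return) estimate
    return ((1.0 - monte_carlo_mixing_rate) * one_step_target
            + monte_carlo_mixing_rate * full_return)
</pre></div>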
|
||||
|
||||
</div>
|
||||
@@ -265,7 +265,7 @@ the single-step bootstrapped targets.</td>
|
||||
<a href="n_step.html" class="btn btn-neutral float-right" title="N-Step Q Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="dueling_dqn.html" class="btn btn-neutral" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="dueling_dqn.html" class="btn btn-neutral float-left" title="Dueling DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -274,7 +274,7 @@ the single-step bootstrapped targets.</td>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -291,27 +291,16 @@ the single-step bootstrapped targets.</td>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>N-Step Q Learning — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>N-Step Q Learning — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Mixed Monte Carlo" href="mmc.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -228,43 +231,39 @@
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<p>The <span class="math notranslate nohighlight">\(N\)</span>-step Q learning algorithm works in similar manner to DQN except for the following changes:</p>
|
||||
<ol class="arabic simple">
|
||||
<li>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
|
||||
<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</li>
|
||||
<li>In order to stabilize the learning, multiple workers work together to update the network.
|
||||
This creates the same effect as decorrelating the samples used for training.</li>
|
||||
<li>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
|
||||
<li><p>No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every
|
||||
<span class="math notranslate nohighlight">\(N\)</span> steps using the latest <span class="math notranslate nohighlight">\(N\)</span> steps played by the agent.</p></li>
|
||||
<li><p>In order to stabilize the learning, multiple workers work together to update the network.
|
||||
This creates the same effect as decorrelating the samples used for training.</p></li>
|
||||
<li><p>Instead of using single-step Q targets for the network, the rewards from <span class="math notranslate nohighlight">\(N\)</span> consecutive steps are accumulated
|
||||
to form the <span class="math notranslate nohighlight">\(N\)</span>-step Q targets, according to the following equation:
|
||||
<span class="math notranslate nohighlight">\(R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})\)</span>
|
||||
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</li>
|
||||
where <span class="math notranslate nohighlight">\(k\)</span> is <span class="math notranslate nohighlight">\(T_{max} - State\_Index\)</span> for each state in the batch</p></li>
|
||||
</ol>
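<p>The targets in step 3 can be accumulated backwards over the last N collected steps. A small sketch, assuming bootstrap_value holds V(s_{t+N}) from the network (or 0 if the episode terminated); the names are hypothetical and this is not the Coach implementation:</p>
<div class="highlight"><pre>
def n_step_q_targets(rewards, gamma, bootstrap_value):
    # rewards: the last N rewards r_t, ..., r_{t+N-1}
    # accumulate backwards: R_i = r_i + gamma * R_{i+1}, starting from the bootstrap value
    targets = [0.0] * len(rewards)
    running = bootstrap_value
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        targets[i] = running
    return targets
</pre></div>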
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.n_step_q_agent.</code><code class="descname">NStepQAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/n_step_q_agent.html#NStepQAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</li>
|
||||
<li><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_steps_between_copying_online_weights_to_target</strong> – (StepMethod)
|
||||
The number of steps between copying the online network weights to the target network weights.</p></li>
|
||||
<li><p><strong>apply_gradients_every_x_episodes</strong> – (int)
|
||||
The number of episodes between applying the accumulated gradients to the network. After every
|
||||
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data;
it will then accumulate them in internal accumulators, and will only apply them to the network once in every
|
||||
apply_gradients_every_x_episodes episodes.</li>
|
||||
<li><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
apply_gradients_every_x_episodes episodes.</p></li>
|
||||
<li><p><strong>num_steps_between_gradient_updates</strong> – (int)
|
||||
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
|
||||
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
|
||||
are used in the batch.</li>
|
||||
<li><strong>targets_horizon</strong> – (str)
|
||||
are used in the batch.</p></li>
|
||||
<li><p><strong>targets_horizon</strong> – (str)
|
||||
Should be either ‘N-Step’ or ‘1-Step’, and defines the horizon over which to bootstrap the network values.
Essentially, 1-Step follows the regular 1-step bootstrapped Q-learning update. For more information,
|
||||
please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</li>
|
||||
please refer to the original paper (<a class="reference external" href="https://arxiv.org/abs/1602.01783">https://arxiv.org/abs/1602.01783</a>)</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -282,7 +281,7 @@ please refer to the original paper (<a class="reference external" href="https://
|
||||
<a href="naf.html" class="btn btn-neutral float-right" title="Normalized Advantage Functions" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="mmc.html" class="btn btn-neutral" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="mmc.html" class="btn btn-neutral float-left" title="Mixed Monte Carlo" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -291,7 +290,7 @@ please refer to the original paper (<a class="reference external" href="https://
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -308,27 +307,16 @@ please refer to the original paper (<a class="reference external" href="https://
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Normalized Advantage Functions — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Normalized Advantage Functions — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="N-Step Q Learning" href="n_step.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -258,7 +261,7 @@ After every training step, use a soft update in order to copy the weights from t
|
||||
<a href="nec.html" class="btn btn-neutral float-right" title="Neural Episodic Control" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="n_step.html" class="btn btn-neutral" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="n_step.html" class="btn btn-neutral float-left" title="N-Step Q Learning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -267,7 +270,7 @@ After every training step, use a soft update in order to copy the weights from t
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -284,27 +287,16 @@ After every training step, use a soft update in order to copy the weights from t
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Neural Episodic Control — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Neural Episodic Control — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Normalized Advantage Functions" href="naf.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -229,14 +232,14 @@
|
||||
<div class="section" id="choosing-an-action">
|
||||
<h3>Choosing an action<a class="headerlink" href="#choosing-an-action" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
|
||||
output from the middleware.</li>
|
||||
<li>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
|
||||
<li><p>Use the current state as an input to the online network and extract the state embedding, which is the intermediate
|
||||
output from the middleware.</p></li>
|
||||
<li><p>For each possible action <span class="math notranslate nohighlight">\(a_i\)</span>, run the DND head using the state embedding and the selected action <span class="math notranslate nohighlight">\(a_i\)</span> as inputs.
|
||||
The DND is queried and returns the <span class="math notranslate nohighlight">\(P\)</span> nearest neighbor keys and values. The keys and values are used to calculate
|
||||
and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</li>
|
||||
<li>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</li>
|
||||
<li>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
|
||||
accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</li>
|
||||
and return the action <span class="math notranslate nohighlight">\(Q\)</span> value from the network.</p></li>
|
||||
<li><p>Pass all the <span class="math notranslate nohighlight">\(Q\)</span> values to the exploration policy and choose an action accordingly.</p></li>
|
||||
<li><p>Store the state embeddings and actions taken during the current episode in a small buffer <span class="math notranslate nohighlight">\(B\)</span>, in order to
|
||||
accumulate transitions until it is possible to calculate the total discounted returns over the entire episode.</p></li>
|
||||
</ol>
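<p>To make step 2 above concrete, the following is a small NumPy sketch of how a single <span class="math notranslate nohighlight">\(Q\)</span> value can be computed from the <span class="math notranslate nohighlight">\(P\)</span> nearest neighbors returned by the DND. It is illustrative only and is not the Coach implementation; the array names and the inverse-distance kernel with the <span class="math notranslate nohighlight">\(\delta\)</span> term are assumptions made for the example.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def dnd_q_value(embedding, dnd_keys, dnd_values, p=50, delta=1e-3):
    # embedding: (d,) state embedding taken from the middleware output
    # dnd_keys: (n, d) stored embeddings; dnd_values: (n,) stored returns
    dists = np.square(dnd_keys - embedding[None, :]).sum(axis=1)
    nearest = np.argsort(dists)[:p]                # indices of the P nearest neighbors
    kernel = 1.0 / (dists[nearest] + delta)        # inverse-distance kernel; delta avoids division by zero
    weights = kernel / kernel.sum()
    return float(np.dot(weights, dnd_values[nearest]))
</pre></div></div>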
|
||||
</div>
|
||||
<div class="section" id="finalizing-an-episode">
|
||||
@@ -256,40 +259,36 @@ the network if necessary:
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.nec_agent.NECAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.nec_agent.</code><code class="descname">NECAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/nec_agent.html#NECAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.nec_agent.NECAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>dnd_size</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dnd_size</strong> – (int)
|
||||
Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
|
||||
of transitions that will be stored is dnd_size x num_actions.</li>
|
||||
<li><strong>l2_norm_added_delta</strong> – (float)
|
||||
of transitions that will be stored is dnd_size x num_actions.</p></li>
|
||||
<li><p><strong>l2_norm_added_delta</strong> – (float)
|
||||
A small value that will be added when calculating the weight of each of the DND entries. This follows the
|
||||
<span class="math notranslate nohighlight">\(\delta\)</span> parameter defined in the paper.</li>
|
||||
<li><strong>new_value_shift_coefficient</strong> – (float)
|
||||
<span class="math notranslate nohighlight">\(\delta\)</span> parameter defined in the paper.</p></li>
|
||||
<li><p><strong>new_value_shift_coefficient</strong> – (float)
|
||||
In the case where a new embedding that is added to the DND is already present, the value that will be stored
|
||||
in the DND is a mix between the existing value and the new value. The mix rate is defined by
|
||||
new_value_shift_coefficient.</li>
|
||||
<li><strong>number_of_knn</strong> – (int)
|
||||
The number of neighbors that will be retrieved for each DND query.</li>
|
||||
<li><strong>DND_key_error_threshold</strong> – (float)
|
||||
new_value_shift_coefficient.</p></li>
|
||||
<li><p><strong>number_of_knn</strong> – (int)
|
||||
The number of neighbors that will be retrieved for each DND query.</p></li>
|
||||
<li><p><strong>DND_key_error_threshold</strong> – (float)
|
||||
When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
|
||||
exists in the DND, since exact matches of embeddings are very rare.</li>
|
||||
<li><strong>propagate_updates_to_DND</strong> – (bool)
|
||||
exists in the DND, since exact matches of embeddings are very rare.</p></li>
|
||||
<li><p><strong>propagate_updates_to_DND</strong> – (bool)
|
||||
If set to True, when the gradients of the network will be calculated, the gradients will also be
|
||||
backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
|
||||
network weights.</li>
|
||||
<li><strong>n_step</strong> – (int)
|
||||
The bootstrap length that will be used when calculating the state values to store in the DND.</li>
|
||||
<li><strong>bootstrap_total_return_from_old_policy</strong> – (bool)
|
||||
network weights.</p></li>
|
||||
<li><p><strong>n_step</strong> – (int)
|
||||
The bootstrap length that will be used when calculating the state values to store in the DND.</p></li>
|
||||
<li><p><strong>bootstrap_total_return_from_old_policy</strong> – (bool)
|
||||
If set to True, the bootstrap that will be used to calculate each state-action value, is the network value
|
||||
when the state was first seen, and not the latest, most up-to-date network value.</li>
|
||||
when the state was first seen, and not the latest, most up-to-date network value.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -307,7 +306,7 @@ when the state was first seen, and not the latest, most up-to-date network value
|
||||
<a href="pal.html" class="btn btn-neutral float-right" title="Persistent Advantage Learning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="naf.html" class="btn btn-neutral" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="naf.html" class="btn btn-neutral float-left" title="Normalized Advantage Functions" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -316,7 +315,7 @@ when the state was first seen, and not the latest, most up-to-date network value
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -333,27 +332,16 @@ when the state was first seen, and not the latest, most up-to-date network value
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Persistent Advantage Learning — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Persistent Advantage Learning — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Neural Episodic Control" href="nec.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -227,47 +230,43 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>Start by calculating the initial target values in the same manner as they are calculated in DDQN
|
||||
<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></li>
|
||||
<li>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>Start by calculating the initial target values in the same manner as they are calculated in DDQN
|
||||
<span class="math notranslate nohighlight">\(y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))\)</span></p></li>
|
||||
<li><p>The action gap <span class="math notranslate nohighlight">\(V(s_t )-Q(s_t,a_t)\)</span> should then be subtracted from each of the calculated targets.
|
||||
To calculate the action gap, run the target network using the current states and get the <span class="math notranslate nohighlight">\(Q\)</span> values
|
||||
for all the actions. Then estimate <span class="math notranslate nohighlight">\(V\)</span> as the maximum predicted <span class="math notranslate nohighlight">\(Q\)</span> value for the current state:
|
||||
<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></li>
|
||||
<li>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
|
||||
<span class="math notranslate nohighlight">\(V(s_t )=max_a Q(s_t,a)\)</span></p></li>
|
||||
<li><p>For <em>advantage learning (AL)</em>, reduce the action gap weighted by a predefined parameter <span class="math notranslate nohighlight">\(\alpha\)</span> from
|
||||
the targets <span class="math notranslate nohighlight">\(y_t^{DDQN}\)</span>:
|
||||
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></li>
|
||||
<li>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
|
||||
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot (V(s_t )-Q(s_t,a_t ))\)</span></p></li>
|
||||
<li><p>For <em>persistent advantage learning (PAL)</em>, the target network is also used in order to calculate the action
|
||||
gap for the next state:
|
||||
<span class="math notranslate nohighlight">\(V(s_{t+1} )-Q(s_{t+1},a_{t+1})\)</span>
|
||||
where <span class="math notranslate nohighlight">\(a_{t+1}\)</span> is chosen by running the next states through the online network and choosing the action that
|
||||
has the highest predicted <span class="math notranslate nohighlight">\(Q\)</span> value. Finally, the targets will be defined as -
|
||||
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></li>
|
||||
<li>Train the online network using the current states as inputs, and with the aforementioned targets.</li>
|
||||
<li>Once in every few thousand steps, copy the weights from the online network to the target network.</li>
|
||||
<span class="math notranslate nohighlight">\(y_t=y_t^{DDQN}-\alpha \cdot min(V(s_t )-Q(s_t,a_t ),V(s_{t+1} )-Q(s_{t+1},a_{t+1} ))\)</span></p></li>
|
||||
<li><p>Train the online network using the current states as inputs, and with the aforementioned targets.</p></li>
|
||||
<li><p>Once in every few thousand steps, copy the weights from the online network to the target network.</p></li>
|
||||
</ol>
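<p>As an illustration of the target computation in the steps above, here is a minimal NumPy sketch. It is not the Coach implementation; the array names are assumptions, and terminal transitions are ignored for brevity.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def pal_targets(rewards, gamma, alpha, q_online_next, q_target_curr, q_target_next, actions, persistent=True):
    # q_online_next / q_target_next: (batch, num_actions) Q values for the next states
    # q_target_curr: (batch, num_actions) target-network Q values for the current states
    idx = np.arange(len(rewards))
    greedy_next = np.argmax(q_online_next, axis=1)                       # a_{t+1} chosen by the online network
    ddqn_targets = rewards + gamma * q_target_next[idx, greedy_next]     # y_t^DDQN
    gap_curr = q_target_curr.max(axis=1) - q_target_curr[idx, actions]   # V(s_t) - Q(s_t, a_t)
    if persistent:
        gap_next = q_target_next.max(axis=1) - q_target_next[idx, greedy_next]
        gap = np.minimum(gap_curr, gap_next)                             # PAL keeps the smaller action gap
    else:
        gap = gap_curr                                                   # plain advantage learning
    return ddqn_targets - alpha * gap
</pre></div></div>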
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.pal_agent.PALAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.pal_agent.</code><code class="descname">PALAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/pal_agent.html#PALAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.pal_agent.PALAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>pal_alpha</strong> – (float)
|
||||
A factor that weights the amount by which the advantage learning update will be taken into account.</li>
|
||||
<li><strong>persistent_advantage_learning</strong> – (bool)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>pal_alpha</strong> – (float)
|
||||
A factor that weights the amount by which the advantage learning update will be taken into account.</p></li>
|
||||
<li><p><strong>persistent_advantage_learning</strong> – (bool)
|
||||
If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
|
||||
the same actions one after the other instead of changing actions.</li>
|
||||
<li><strong>monte_carlo_mixing_rate</strong> – (float)
|
||||
the same actions one after the other instead of changing actions.</p></li>
|
||||
<li><p><strong>monte_carlo_mixing_rate</strong> – (float)
|
||||
The amount of Monte Carlo values to mix into the targets of the network. The Monte Carlo values are just the
|
||||
total discounted returns, and they can help reduce the time it takes for the network to update to the newly
|
||||
seen values, since it is not based on bootstrapping the current network values.</li>
|
||||
seen values, since it is not based on bootstrapping the current network values.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -285,7 +284,7 @@ seen values, since it is not based on bootstrapping the current network values.<
|
||||
<a href="../policy_optimization/pg.html" class="btn btn-neutral float-right" title="Policy Gradient" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="nec.html" class="btn btn-neutral" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="nec.html" class="btn btn-neutral float-left" title="Neural Episodic Control" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -294,7 +293,7 @@ seen values, since it is not based on bootstrapping the current network values.<
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -311,27 +310,16 @@ seen values, since it is not based on bootstrapping the current network values.<
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Quantile Regression DQN — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Quantile Regression DQN — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Rainbow" href="rainbow.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -227,33 +230,29 @@
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic simple">
|
||||
<li>Sample a batch of transitions from the replay buffer.</li>
|
||||
<li>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>First, the next state quantiles are predicted. These are used in order to calculate the targets for the network,
|
||||
by following the Bellman equation.
|
||||
Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the
|
||||
quantile midpoints targets.</li>
|
||||
<li>The network is trained with the quantile regression loss between the resulting quantile locations and the target
|
||||
quantile locations. Only the targets of the actions that were actually taken are updated.</li>
|
||||
<li>Once in every few thousand steps, weights are copied from the online network to the target network.</li>
|
||||
quantile midpoints targets.</p></li>
|
||||
<li><p>The network is trained with the quantile regression loss between the resulting quantile locations and the target
|
||||
quantile locations. Only the targets of the actions that were actually taken are updated.</p></li>
|
||||
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
|
||||
</ol>
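<p>The quantile regression loss from step 3 can be sketched as follows. This is an illustrative NumPy snippet rather than the Coach implementation, and the tensor shapes are assumptions made for the example.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def quantile_huber_loss(pred_quantiles, target_quantiles, kappa=1.0):
    # pred_quantiles, target_quantiles: (batch, atoms) quantile locations
    atoms = pred_quantiles.shape[1]
    tau_hat = (np.arange(atoms) + 0.5) / atoms                        # quantile midpoints
    u = target_quantiles[:, None, :] - pred_quantiles[:, :, None]     # pairwise TD errors, shape (batch, i, j)
    quad = np.minimum(np.abs(u), kappa)
    lin = np.abs(u) - quad
    huber = 0.5 * quad ** 2 + kappa * lin                             # quadratic inside [-kappa, kappa], linear outside
    weight = np.abs(tau_hat[None, :, None] - np.signbit(u).astype(float))
    return (weight * huber / kappa).mean(axis=2).sum(axis=1).mean()
</pre></div></div>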
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.qr_dqn_agent.</code><code class="descname">QuantileRegressionDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/qr_dqn_agent.html#QuantileRegressionDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>atoms</strong> – (int)
|
||||
the number of atoms to predict for each action</li>
|
||||
<li><strong>huber_loss_interval</strong> – (float)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>atoms</strong> – (int)
|
||||
the number of atoms to predict for each action</p></li>
|
||||
<li><p><strong>huber_loss_interval</strong> – (float)
|
||||
One of the Huber loss parameters, referred to as <span class="math notranslate nohighlight">\(\kappa\)</span> in the paper.
|
||||
It describes the interval [-k, k] in which the Huber loss acts as an MSE loss.</li>
|
||||
It describes the interval [-k, k] in which the Huber loss acts as an MSE loss.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -271,7 +270,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
|
||||
<a href="../../architectures/index.html" class="btn btn-neutral float-right" title="Architectures" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="rainbow.html" class="btn btn-neutral" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="rainbow.html" class="btn btn-neutral float-left" title="Rainbow" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -280,7 +279,7 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -297,27 +296,16 @@ It describes the interval [-k, k] in which the huber loss acts as a MSE loss.</l
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Rainbow — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Rainbow — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Proximal Policy Optimization" href="../policy_optimization/ppo.html" />
|
||||
<link href="../../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -226,19 +229,18 @@
|
||||
<h2>Algorithm Description<a class="headerlink" href="#algorithm-description" title="Permalink to this headline">¶</a></h2>
|
||||
<p>Rainbow combines 6 recent advancements in reinforcement learning:</p>
|
||||
<ul class="simple">
|
||||
<li>N-step returns</li>
|
||||
<li>Distributional state-action value learning</li>
|
||||
<li>Dueling networks</li>
|
||||
<li>Noisy Networks</li>
|
||||
<li>Double DQN</li>
|
||||
<li>Prioritized Experience Replay</li>
|
||||
<li><p>N-step returns</p></li>
|
||||
<li><p>Distributional state-action value learning</p></li>
|
||||
<li><p>Dueling networks</p></li>
|
||||
<li><p>Noisy Networks</p></li>
|
||||
<li><p>Double DQN</p></li>
|
||||
<li><p>Prioritized Experience Replay</p></li>
|
||||
</ul>
|
||||
<div class="section" id="training-the-network">
|
||||
<h3>Training the network<a class="headerlink" href="#training-the-network" title="Permalink to this headline">¶</a></h3>
|
||||
<ol class="arabic">
|
||||
<li><p class="first">Sample a batch of transitions from the replay buffer.</p>
|
||||
</li>
|
||||
<li><p class="first">The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
|
||||
<li><p>Sample a batch of transitions from the replay buffer.</p></li>
|
||||
<li><p>The Bellman update is projected to the set of atoms representing the <span class="math notranslate nohighlight">\(Q\)</span> values distribution, such
|
||||
that the <span class="math notranslate nohighlight">\(i-th\)</span> component of the projected update is calculated as follows:</p>
|
||||
<p><span class="math notranslate nohighlight">\((\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))\)</span></p>
|
||||
<p>where:
|
||||
@@ -246,36 +248,29 @@ that the <span class="math notranslate nohighlight">\(i-th\)</span> component of
|
||||
* <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}}\)</span> is the Bellman update for atom
|
||||
<span class="math notranslate nohighlight">\(z_j\)</span>: <span class="math notranslate nohighlight">\(\hat{T}_{z_{j}} := r_t+\gamma r_{t+1} + ... + \gamma r_{t+n-1} + \gamma^{n-1} z_j\)</span></p>
|
||||
</li>
|
||||
<li><p class="first">Network is trained with the cross entropy loss between the resulting probability distribution and the target
|
||||
probability distribution. Only the target of the actions that were actually taken is updated.</p>
|
||||
</li>
|
||||
<li><p class="first">Once in every few thousand steps, weights are copied from the online network to the target network.</p>
|
||||
</li>
|
||||
<li><p class="first">After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
|
||||
using the KL divergence loss that is returned from the network.</p>
|
||||
</li>
|
||||
<li><p>Network is trained with the cross entropy loss between the resulting probability distribution and the target
|
||||
probability distribution. Only the target of the actions that were actually taken is updated.</p></li>
|
||||
<li><p>Once in every few thousand steps, weights are copied from the online network to the target network.</p></li>
|
||||
<li><p>After every training step, the priorities of the batch transitions are updated in the prioritized replay buffer
|
||||
using the KL divergence loss that is returned from the network.</p></li>
|
||||
</ol>
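<p>For clarity, the projection of the Bellman update onto the fixed support (step 2) can be sketched in NumPy as follows. This is illustrative only and is not the Coach implementation; names such as <code>returns</code> and <code>next_probs</code> are assumptions made for the example.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
import numpy as np

def project_distribution(next_probs, returns, discount_n, z, v_min, v_max):
    # next_probs: (batch, atoms) probabilities of Z(s_{t+n}, pi(s_{t+n}))
    # returns: (batch,) accumulated n-step discounted rewards; z: (atoms,) support; discount_n: gamma^n
    batch, atoms = next_probs.shape
    delta_z = (v_max - v_min) / (atoms - 1)
    tz = np.clip(returns[:, None] + discount_n * z[None, :], v_min, v_max)   # Bellman update per atom
    b = (tz - v_min) / delta_z
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    projected = np.zeros_like(next_probs)
    for i in range(batch):
        for j in range(atoms):
            if lower[i, j] == upper[i, j]:
                projected[i, lower[i, j]] += next_probs[i, j]                # atom lands exactly on a support point
            else:
                projected[i, lower[i, j]] += next_probs[i, j] * (upper[i, j] - b[i, j])
                projected[i, upper[i, j]] += next_probs[i, j] * (b[i, j] - lower[i, j])
    return projected
</pre></div></div>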
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.agents.rainbow_dqn_agent.</code><code class="descname">RainbowDQNAlgorithmParameters</code><a class="reference internal" href="../../../_modules/rl_coach/agents/rainbow_dqn_agent.html#RainbowDQNAlgorithmParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>n_step</strong> – (int)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>n_step</strong> – (int)
|
||||
The number of steps to bootstrap the network over. The first N-1 steps' actual rewards will be accumulated
|
||||
using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
|
||||
prediction.</li>
|
||||
<li><strong>store_transitions_only_when_episodes_are_terminated</strong> – (bool)
|
||||
prediction.</p></li>
|
||||
<li><p><strong>store_transitions_only_when_episodes_are_terminated</strong> – (bool)
|
||||
If set to True, the transitions will be stored in an Episode object until the episode ends, and just then
|
||||
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
|
||||
transitions into the memory, and to do so we need the entire episode first.</li>
|
||||
transitions into the memory, and to do so we need the entire episode first.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -293,7 +288,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
|
||||
<a href="qr_dqn.html" class="btn btn-neutral float-right" title="Quantile Regression DQN" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../policy_optimization/ppo.html" class="btn btn-neutral" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../policy_optimization/ppo.html" class="btn btn-neutral float-left" title="Proximal Policy Optimization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -302,7 +297,7 @@ transitions into the memory, and to do so we need the entire episode first.</li>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -319,27 +314,16 @@ transitions into the memory, and to do so we need the entire episode first.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Architectures — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Architectures — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Quantile Regression DQN" href="../agents/value_optimization/qr_dqn.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -194,75 +197,71 @@ parts that are implemented using TensorFlow.</p>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.base_parameters.NetworkParameters">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.base_parameters.</code><code class="descname">NetworkParameters</code><span class="sig-paren">(</span><em>force_cpu=False</em>, <em>async_training=False</em>, <em>shared_optimizer=True</em>, <em>scale_down_gradients_by_number_of_workers_for_sync_training=True</em>, <em>clip_gradients=None</em>, <em>gradients_clipping_method=<GradientClippingMethod.ClipByGlobalNorm: 0></em>, <em>l2_regularization=0</em>, <em>learning_rate=0.00025</em>, <em>learning_rate_decay_rate=0</em>, <em>learning_rate_decay_steps=0</em>, <em>input_embedders_parameters={}</em>, <em>embedding_merger_type=<EmbeddingMergerType.Concat: 0></em>, <em>middleware_parameters=None</em>, <em>heads_parameters=[]</em>, <em>use_separate_networks_per_head=False</em>, <em>optimizer_type='Adam'</em>, <em>optimizer_epsilon=0.0001</em>, <em>adam_optimizer_beta1=0.9</em>, <em>adam_optimizer_beta2=0.99</em>, <em>rms_prop_optimizer_decay=0.9</em>, <em>batch_size=32</em>, <em>replace_mse_with_huber_loss=False</em>, <em>create_target_network=False</em>, <em>tensorflow_support=True</em>, <em>softmax_temperature=1</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/base_parameters.html#NetworkParameters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.base_parameters.NetworkParameters" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>force_cpu</strong> – Force the neural networks to run on the CPU even if a GPU is available</li>
|
||||
<li><strong>async_training</strong> – If set to True, asynchronous training will be used, meaning that each worker will progress at its own
|
||||
speed, without waiting for the rest of the workers to calculate their gradients.</li>
|
||||
<li><strong>shared_optimizer</strong> – If set to True, a central optimizer which will be shared with all the workers will be used for applying
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>force_cpu</strong> – Force the neural networks to run on the CPU even if a GPU is available</p></li>
|
||||
<li><p><strong>async_training</strong> – If set to True, asynchronous training will be used, meaning that each worker will progress at its own
|
||||
speed, without waiting for the rest of the workers to calculate their gradients.</p></li>
|
||||
<li><p><strong>shared_optimizer</strong> – If set to True, a central optimizer which will be shared with all the workers will be used for applying
|
||||
gradients to the network. Otherwise, each worker will have its own optimizer with its own internal
|
||||
parameters that will only be affected by the gradients calculated by that worker</li>
|
||||
<li><strong>scale_down_gradients_by_number_of_workers_for_sync_training</strong> – If set to True, in synchronous training, the gradients of each worker will be scaled down by the
|
||||
parameters that will only be affected by the gradients calculated by that worker</p></li>
|
||||
<li><p><strong>scale_down_gradients_by_number_of_workers_for_sync_training</strong> – If set to True, in synchronous training, the gradients of each worker will be scaled down by the
|
||||
number of workers. This essentially means that the gradients applied to the network are the average
|
||||
of the gradients over all the workers.</li>
|
||||
<li><strong>clip_gradients</strong> – A value that will be used for clipping the gradients of the network. If set to None, no gradient clipping
|
||||
will be applied. Otherwise, the gradients will be clipped according to the gradients_clipping_method.</li>
|
||||
<li><strong>gradients_clipping_method</strong> – A gradient clipping method, defined by a GradientClippingMethod enum, which will be used to clip the
|
||||
of the gradients over all the workers.</p></li>
|
||||
<li><p><strong>clip_gradients</strong> – A value that will be used for clipping the gradients of the network. If set to None, no gradient clipping
|
||||
will be applied. Otherwise, the gradients will be clipped according to the gradients_clipping_method.</p></li>
|
||||
<li><p><strong>gradients_clipping_method</strong> – A gradient clipping method, defined by a GradientClippingMethod enum, which will be used to clip the
|
||||
gradients of the network. This will only be used if the clip_gradients value is defined as a value other
|
||||
than None.</li>
|
||||
<li><strong>l2_regularization</strong> – An L2 regularization weight that will be applied to the network weights while calculating the loss function</li>
|
||||
<li><strong>learning_rate</strong> – The learning rate for the network</li>
|
||||
<li><strong>learning_rate_decay_rate</strong> – If this value is larger than 0, an exponential decay will be applied to the network learning rate.
|
||||
than None.</p></li>
|
||||
<li><p><strong>l2_regularization</strong> – An L2 regularization weight that will be applied to the network weights while calculating the loss function</p></li>
|
||||
<li><p><strong>learning_rate</strong> – The learning rate for the network</p></li>
|
||||
<li><p><strong>learning_rate_decay_rate</strong> – If this value is larger than 0, an exponential decay will be applied to the network learning rate.
|
||||
The rate of the decay is defined by this parameter, and the number of training steps the decay will be
|
||||
applied is defined by learning_rate_decay_steps. Notice that both parameters should be defined in order
|
||||
for this to work correctly.</li>
|
||||
<li><strong>learning_rate_decay_steps</strong> – If the learning_rate_decay_rate of the network is larger than 0, an exponential decay will be applied to
|
||||
for this to work correctly.</p></li>
|
||||
<li><p><strong>learning_rate_decay_steps</strong> – If the learning_rate_decay_rate of the network is larger than 0, an exponential decay will be applied to
|
||||
the network learning rate. The number of steps the decay will be applied is defined by this parameter.
|
||||
Notice that both this parameter, as well as learning_rate_decay_rate should be defined in order for the
|
||||
learning rate decay to work correctly.</li>
|
||||
<li><strong>input_embedders_parameters</strong> – A dictionary mapping between input names and input embedders (InputEmbedderParameters) to use for the
|
||||
learning rate decay to work correctly.</p></li>
|
||||
<li><p><strong>input_embedders_parameters</strong> – A dictionary mapping between input names and input embedders (InputEmbedderParameters) to use for the
|
||||
network. Each of the keys is an input name as returned from the environment in the state.
|
||||
For example, if the environment returns a state containing ‘observation’ and ‘measurements’, then
|
||||
the keys for the input embedders dictionary can be either ‘observation’ to use the observation as input,
|
||||
‘measurements’ to use the measurements as input, or both.
|
||||
The embedder type will be automatically selected according to the input type. Vector inputs will
|
||||
produce a fully connected embedder, and image inputs will produce a convolutional embedder.</li>
|
||||
<li><strong>embedding_merger_type</strong> – The type of embedding merging to use, given by one of the EmbeddingMergerType enum values.
|
||||
This will be used to merge the outputs of all the input embedders into a single embedding.</li>
|
||||
<li><strong>middleware_parameters</strong> – The parameters of the middleware to use, given by a MiddlewareParameters object.
|
||||
produce a fully connected embedder, and image inputs will produce a convolutional embedder.</p></li>
|
||||
<li><p><strong>embedding_merger_type</strong> – The type of embedding merging to use, given by one of the EmbeddingMergerType enum values.
|
||||
This will be used to merge the outputs of all the input embedders into a single embedding.</p></li>
|
||||
<li><p><strong>middleware_parameters</strong> – The parameters of the middleware to use, given by a MiddlewareParameters object.
|
||||
Each network will have only a single middleware embedder which will take the merged embeddings from the
|
||||
input embedders and pass them through more neural network layers.</li>
|
||||
<li><strong>heads_parameters</strong> – A list of heads for the network given by their corresponding HeadParameters.
|
||||
input embedders and pass them through more neural network layers.</p></li>
|
||||
<li><p><strong>heads_parameters</strong> – A list of heads for the network given by their corresponding HeadParameters.
|
||||
Each network can have one or multiple network heads, where each one will take the output of the middleware
|
||||
and make some additional computation on top of it. Additionally, each head calculates a weighted loss value,
|
||||
and the loss values from all the heads will be summed later on.</li>
|
||||
<li><strong>use_separate_networks_per_head</strong> – A flag that allows using different copies of the input embedders and middleware for each one of the heads.
|
||||
and the loss values from all the heads will be summed later on.</p></li>
|
||||
<li><p><strong>use_separate_networks_per_head</strong> – A flag that allows using different copies of the input embedders and middleware for each one of the heads.
|
||||
Regularly, the heads will have a shared input, but in the case where use_separate_networks_per_head is set
|
||||
to True, each one of the heads will get a different input.</li>
|
||||
<li><strong>optimizer_type</strong> – A string specifying the optimizer type to use for updating the network. The available optimizers are
|
||||
Adam, RMSProp and LBFGS.</li>
|
||||
<li><strong>optimizer_epsilon</strong> – An internal optimizer parameter used for Adam and RMSProp.</li>
|
||||
<li><strong>adam_optimizer_beta1</strong> – The beta1 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
|
||||
optimizer for the network.</li>
|
||||
<li><strong>adam_optimizer_beta2</strong> – The beta2 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
|
||||
optimizer for the network.</li>
|
||||
<li><strong>rms_prop_optimizer_decay</strong> – The decay value for the RMSProp optimizer, which will be used only in case the RMSProp optimizer was
|
||||
selected for this network.</li>
|
||||
<li><strong>batch_size</strong> – The batch size to use when updating the network.</li>
|
||||
<li><strong>replace_mse_with_huber_loss</strong> – </li>
|
||||
<li><strong>create_target_network</strong> – If this flag is set to True, an additional copy of the network will be created and initialized with the
|
||||
to True, each one of the heads will get a different input.</p></li>
|
||||
<li><p><strong>optimizer_type</strong> – A string specifying the optimizer type to use for updating the network. The available optimizers are
|
||||
Adam, RMSProp and LBFGS.</p></li>
|
||||
<li><p><strong>optimizer_epsilon</strong> – An internal optimizer parameter used for Adam and RMSProp.</p></li>
|
||||
<li><p><strong>adam_optimizer_beta1</strong> – The beta1 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
|
||||
optimizer for the network.</p></li>
|
||||
<li><p><strong>adam_optimizer_beta2</strong> – The beta2 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
|
||||
optimizer for the network.</p></li>
|
||||
<li><p><strong>rms_prop_optimizer_decay</strong> – The decay value for the RMSProp optimizer, which will be used only in case the RMSProp optimizer was
|
||||
selected for this network.</p></li>
|
||||
<li><p><strong>batch_size</strong> – The batch size to use when updating the network.</p></li>
|
||||
<li><p><strong>replace_mse_with_huber_loss</strong> – </p></li>
|
||||
<li><p><strong>create_target_network</strong> – If this flag is set to True, an additional copy of the network will be created and initialized with the
|
||||
same weights as the online network. It can then be queried, and its weights can be synced from the
|
||||
online network at will.</li>
|
||||
<li><strong>tensorflow_support</strong> – A flag which specifies if the network is supported by the TensorFlow framework.</li>
|
||||
<li><strong>softmax_temperature</strong> – If a softmax is present in the network head output, use this temperature</li>
|
||||
online network at will.</p></li>
|
||||
<li><p><strong>tensorflow_support</strong> – A flag which specifies if the network is supported by the TensorFlow framework.</p></li>
|
||||
<li><p><strong>softmax_temperature</strong> – If a softmax is present in the network head output, use this temperature</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
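<p>A minimal usage sketch with a few of the parameters documented above (in a full preset these values are normally set on an agent's network parameters object rather than on a bare <code>NetworkParameters</code> instance):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>
from rl_coach.base_parameters import NetworkParameters

# override a handful of the documented defaults; all other fields keep the values listed above
network_params = NetworkParameters(learning_rate=0.00025,
                                   batch_size=32,
                                   optimizer_type='Adam',
                                   create_target_network=True)
</pre></div></div>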
|
||||
|
||||
<div class="section" id="architecture">
|
||||
@@ -271,19 +270,15 @@ online network at will.</li>
|
||||
<dt id="rl_coach.architectures.architecture.Architecture">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.architectures.architecture.</code><code class="descname">Architecture</code><span class="sig-paren">(</span><em>agent_parameters: rl_coach.base_parameters.AgentParameters</em>, <em>spaces: rl_coach.spaces.SpacesDefinition</em>, <em>name: str = ''</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Creates a neural network ‘architecture’, that can be trained and used for inference.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>agent_parameters</strong> – the agent parameters</li>
|
||||
<li><strong>spaces</strong> – the spaces (observation, action, etc.) definition of the agent</li>
|
||||
<li><strong>name</strong> – the name of the network</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>agent_parameters</strong> – the agent parameters</p></li>
|
||||
<li><p><strong>spaces</strong> – the spaces (observation, action, etc.) definition of the agent</p></li>
|
||||
<li><p><strong>name</strong> – the name of the network</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.architecture.Architecture.accumulate_gradients">
|
||||
<code class="descname">accumulate_gradients</code><span class="sig-paren">(</span><em>inputs: Dict[str, numpy.ndarray], targets: List[numpy.ndarray], additional_fetches: list = None, importance_weights: numpy.ndarray = None, no_accumulation: bool = False</em><span class="sig-paren">)</span> → Tuple[float, List[float], float, list]<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.accumulate_gradients"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.accumulate_gradients" title="Permalink to this definition">¶</a></dt>
|
||||
@@ -292,29 +287,28 @@ gradients for model parameters. Will run forward and backward pass to compute gr
|
||||
values if required and then accumulate gradients from all learners. It does not update the model weights,
|
||||
that’s performed in <cite>apply_and_reset_gradients</cite> method.</p>
|
||||
<p>Once gradients are accumulated, they are accessed via the <cite>accumulated_gradients</cite> property of this class.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>inputs</strong> – <p>typically the environment states (but can also contain other data for loss)
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>inputs</strong> – <p>typically the environment states (but can also contain other data for loss)
|
||||
(e.g. <cite>{‘observation’: numpy.ndarray}</cite> with <cite>observation</cite> of shape (batch_size, observation_space_size) or</p>
|
||||
<blockquote>
|
||||
<div>(batch_size, observation_space_size, stack_size) or</div></blockquote>
|
||||
<div><p>(batch_size, observation_space_size, stack_size) or</p>
|
||||
</div></blockquote>
|
||||
<p><cite>{‘observation’: numpy.ndarray, ‘output_0_0’: numpy.ndarray}</cite> with <cite>output_0_0</cite> of shape (batch_size,))</p>
|
||||
</li>
|
||||
<li><strong>targets</strong> – targets for calculating loss. For example discounted rewards for value network
|
||||
</p></li>
|
||||
<li><p><strong>targets</strong> – targets for calculating loss. For example discounted rewards for value network
|
||||
for calculating the value-network loss would be a target. Length of list and order of arrays in
|
||||
the list matches that of network losses which are defined by network parameters</li>
|
||||
<li><strong>additional_fetches</strong> – list of additional values to fetch and return. The type of each list
|
||||
element is framework dependent.</li>
|
||||
<li><strong>importance_weights</strong> – ndarray of shape (batch_size,) to multiply with batch loss.</li>
|
||||
<li><strong>no_accumulation</strong> – if True, set gradient values to the new gradients, otherwise sum with previously
|
||||
calculated gradients</li>
|
||||
the list matches that of network losses which are defined by network parameters</p></li>
|
||||
<li><p><strong>additional_fetches</strong> – list of additional values to fetch and return. The type of each list
|
||||
element is framework dependent.</p></li>
|
||||
<li><p><strong>importance_weights</strong> – ndarray of shape (batch_size,) to multiply with batch loss.</p></li>
|
||||
<li><p><strong>no_accumulation</strong> – if True, set gradient values to the new gradients, otherwise sum with previously
|
||||
calculated gradients</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><p>tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p><p>tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
|
||||
total_loss (float): sum of all head losses
|
||||
losses (list of float): list of all losses. The order is list of target losses followed by list of</p>
|
||||
<blockquote>
|
||||
@@ -324,10 +318,8 @@ losses (list of float): list of all losses. The order is list of target losses f
|
||||
<p>norm_unclipped_grads (float): global norm of all gradients before any gradient clipping is applied
|
||||
fetched_tensors: all values for additional_fetches</p>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -335,19 +327,15 @@ fetched_tensors: all values for additional_fetches</p>
|
||||
<code class="descname">apply_and_reset_gradients</code><span class="sig-paren">(</span><em>gradients: List[numpy.ndarray], scaler: float = 1.0</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.apply_and_reset_gradients"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.apply_and_reset_gradients" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Applies the given gradients to the network weights and resets the gradient accumulations.
|
||||
Has the same impact as calling <cite>apply_gradients</cite>, then <cite>reset_accumulated_gradients</cite>.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>gradients</strong> – gradients for the parameter weights, taken from <cite>accumulated_gradients</cite> property
|
||||
of an identical network (either self or another identical network)</li>
|
||||
<li><strong>scaler</strong> – A scaling factor that allows rescaling the gradients before applying them</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>gradients</strong> – gradients for the parameter weights, taken from <cite>accumulated_gradients</cite> property
|
||||
of an identical network (either self or another identical network)</p></li>
|
||||
<li><p><strong>scaler</strong> – A scaling factor that allows rescaling the gradients before applying them</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
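<p>A minimal sketch of the accumulate-then-apply flow described above, assuming <cite>network</cite> is an initialized <cite>Architecture</cite> instance and that <cite>states</cite> and <cite>discounted_returns</cite> already match the network’s input and loss definitions:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre>
# Sketch only: `network`, `states` and `discounted_returns` are assumed to exist.
total_loss, losses, grad_norm, fetched = network.accumulate_gradients(
    inputs={'observation': states},   # shape (batch_size, observation_space_size)
    targets=[discounted_returns])     # one target array per network loss
# Apply the accumulated gradients to the weights and reset the accumulation.
network.apply_and_reset_gradients(network.accumulated_gradients)
</pre></div></div>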
|
||||
|
||||
<dl class="method">
|
||||
@@ -355,19 +343,15 @@ of an identical network (either self or another identical network)</li>
|
||||
<code class="descname">apply_gradients</code><span class="sig-paren">(</span><em>gradients: List[numpy.ndarray], scaler: float = 1.0</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.apply_gradients"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.apply_gradients" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Applies the given gradients to the network weights.
|
||||
Will be performed sync or async depending on <cite>network_parameters.async_training</cite></p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>gradients</strong> – gradients for the parameter weights, taken from <cite>accumulated_gradients</cite> property
|
||||
of an identical network (either self or another identical network)</li>
|
||||
<li><strong>scaler</strong> – A scaling factor that allows rescaling the gradients before applying them</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>gradients</strong> – gradients for the parameter weights, taken from <cite>accumulated_gradients</cite> property
|
||||
of an identical network (either self or another identical network)</p></li>
|
||||
<li><p><strong>scaler</strong> – A scaling factor that allows rescaling the gradients before applying them</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -376,15 +360,13 @@ of an identical network (either self or another identical network)</li>
|
||||
<dd><p>Collection of all savers for the network (typically only one saver for network and one for ONNX export)
|
||||
:param parent_path_suffix: path suffix of the parent of the network</p>
|
||||
<blockquote>
|
||||
<div>(e.g. could be name of level manager plus name of agent)</div></blockquote>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">saver collection for the network</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<div><p>(e.g. could be name of level manager plus name of agent)</p>
|
||||
</div></blockquote>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>saver collection for the network</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="staticmethod">
|
||||
@@ -404,75 +386,62 @@ of an identical network (either self or another identical network)</li>
|
||||
<dd><p>Gets the value of a specified variable. The type of the variable is dependent on the framework.
|
||||
Example of a variable is head.kl_coefficient, which could be a symbol for evaluation
|
||||
or could be a string representing the value.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>variable</strong> – variable of interest</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">value of the specified variable</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>variable</strong> – variable of interest</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>value of the specified variable</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.architecture.Architecture.get_weights">
|
||||
<code class="descname">get_weights</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → List[numpy.ndarray]<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.get_weights"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.get_weights" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Gets model weights as a list of ndarrays. It is used for synchronizing weights between two identical networks.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">list weights as ndarray</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>list weights as ndarray</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="staticmethod">
|
||||
<dt id="rl_coach.architectures.architecture.Architecture.parallel_predict">
|
||||
<em class="property">static </em><code class="descname">parallel_predict</code><span class="sig-paren">(</span><em>sess: Any, network_input_tuples: List[Tuple[Architecture, Dict[str, numpy.ndarray]]]</em><span class="sig-paren">)</span> → Tuple[numpy.ndarray, ...]<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.parallel_predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.parallel_predict" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>sess</strong> – active session to use for prediction</li>
|
||||
<li><strong>network_input_tuples</strong> – tuple of network and corresponding input</li>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sess</strong> – active session to use for prediction</p></li>
|
||||
<li><p><strong>network_input_tuples</strong> – tuple of network and corresponding input</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">list or tuple of outputs from all networks</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>list or tuple of outputs from all networks</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.architecture.Architecture.predict">
|
||||
<code class="descname">predict</code><span class="sig-paren">(</span><em>inputs: Dict[str, numpy.ndarray], outputs: List[Any] = None, squeeze_output: bool = True, initial_feed_dict: Dict[Any, numpy.ndarray] = None</em><span class="sig-paren">)</span> → Tuple[numpy.ndarray, ...]<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.predict"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.predict" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given input observations, use the model to make predictions (e.g. action or value).</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>inputs</strong> – current state (i.e. observations, measurements, goals, etc.)
|
||||
(e.g. <cite>{‘observation’: numpy.ndarray}</cite> of shape (batch_size, observation_space_size))</li>
|
||||
<li><strong>outputs</strong> – list of outputs to return. Return all outputs if unspecified. Type of the list elements
|
||||
depends on the framework backend.</li>
|
||||
<li><strong>squeeze_output</strong> – call squeeze_list on output before returning if True</li>
|
||||
<li><strong>initial_feed_dict</strong> – a dictionary of extra inputs for forward pass.</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>inputs</strong> – current state (i.e. observations, measurements, goals, etc.)
|
||||
(e.g. <cite>{‘observation’: numpy.ndarray}</cite> of shape (batch_size, observation_space_size))</p></li>
|
||||
<li><p><strong>outputs</strong> – list of outputs to return. Return all outputs if unspecified. Type of the list elements
|
||||
depends on the framework backend.</p></li>
|
||||
<li><p><strong>squeeze_output</strong> – call squeeze_list on output before returning if True</p></li>
|
||||
<li><p><strong>initial_feed_dict</strong> – a dictionary of extra inputs for forward pass.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">predictions of action or value of shape (batch_size, action_space_size) for action predictions)</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>predictions of action or value of shape (batch_size, action_space_size) for action predictions)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
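<p>As an illustration, a single forward pass might look like the following sketch; the state dictionary key and the observation size of 4 are assumptions, and a leading batch dimension is assumed even when predicting for one state:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre>
import numpy as np

# Sketch only: `network` is assumed to be an initialized Architecture instance.
single_state = np.random.rand(1, 4)   # (batch_size=1, observation_space_size=4)
prediction = network.predict(inputs={'observation': single_state})
</pre></div></div>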
|
||||
|
||||
<dl class="method">
|
||||
@@ -490,41 +459,33 @@ which must return a list of numpy ndarrays. Child class must ensure that <cite>a
|
||||
and is a unique identifier for assigning value to a variable. For example an agent may use
|
||||
head.assign_kl_coefficient. There is a one to one mapping between assign_op and placeholder
|
||||
(in the example above, placeholder would be head.kl_coefficient_ph).</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>assign_op</strong> – a parameter representing the operation for assigning value to a specific variable</li>
|
||||
<li><strong>value</strong> – value of the specified variable used for update</li>
|
||||
<li><strong>placeholder</strong> – a placeholder for binding the value to assign_op.</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>assign_op</strong> – a parameter representing the operation for assigning value to a specific variable</p></li>
|
||||
<li><p><strong>value</strong> – value of the specified variable used for update</p></li>
|
||||
<li><p><strong>placeholder</strong> – a placeholder for binding the value to assign_op.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.architecture.Architecture.set_weights">
|
||||
<code class="descname">set_weights</code><span class="sig-paren">(</span><em>weights: List[numpy.ndarray], rate: float = 1.0</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/architectures/architecture.html#Architecture.set_weights"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.architecture.Architecture.set_weights" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sets model weights for provided layer parameters.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>weights</strong> – list of model weights in the same order as received in get_weights</li>
|
||||
<li><strong>rate</strong> – controls the mixture of given weight values versus old weight values.
|
||||
i.e. new_weight = rate * given_weight + (1 - rate) * old_weight</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>weights</strong> – list of model weights in the same order as received in get_weights</p></li>
|
||||
<li><p><strong>rate</strong> – controls the mixture of given weight values versus old weight values.
|
||||
i.e. new_weight = rate * given_weight + (1 - rate) * old_weight</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">None</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
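<p>Together with <cite>get_weights</cite>, this supports the weight-synchronization pattern described above. A hedged sketch, assuming <cite>online</cite> and <cite>target</cite> are two identical <cite>Architecture</cite> instances:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre>
# Sketch only: hard copy (rate=1.0) and soft update (small rate) between two networks.
target.set_weights(online.get_weights(), rate=1.0)    # exact copy
target.set_weights(online.get_weights(), rate=0.001)  # new_weight = 0.001 * online + 0.999 * old
</pre></div></div>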
|
||||
|
||||
<dl class="method">
|
||||
@@ -535,26 +496,24 @@ forward pass and backward pass of the network, accumulates the gradients and app
|
||||
update the weights.
|
||||
Calls <cite>accumulate_gradients</cite> followed by <cite>apply_and_reset_gradients</cite>.
|
||||
Note: Currently an unused method.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>inputs</strong> – typically the environment states (but can also contain other data necessary for loss).
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>inputs</strong> – typically the environment states (but can also contain other data necessary for loss).
|
||||
(e.g. <cite>{‘observation’: numpy.ndarray}</cite> with <cite>observation</cite> of shape (batch_size, observation_space_size) or
|
||||
(batch_size, observation_space_size, stack_size) or
|
||||
<cite>{‘observation’: numpy.ndarray, ‘output_0_0’: numpy.ndarray}</cite> with <cite>output_0_0</cite> of shape (batch_size,))</li>
|
||||
<li><strong>targets</strong> – target values of shape (batch_size, ). For example discounted rewards for value network
|
||||
<cite>{‘observation’: numpy.ndarray, ‘output_0_0’: numpy.ndarray}</cite> with <cite>output_0_0</cite> of shape (batch_size,))</p></li>
|
||||
<li><p><strong>targets</strong> – target values of shape (batch_size, ). For example discounted rewards for value network
|
||||
for calculating the value-network loss would be a target. Length of list and order of arrays in
|
||||
the list matches that of network losses which are defined by network parameters</li>
|
||||
<li><strong>scaler</strong> – value to scale gradients by before optimizing network weights</li>
|
||||
<li><strong>additional_fetches</strong> – list of additional values to fetch and return. The type of each list
|
||||
element is framework dependent.</li>
|
||||
<li><strong>importance_weights</strong> – ndarray of shape (batch_size,) to multiply with batch loss.</li>
|
||||
the list matches that of network losses which are defined by network parameters</p></li>
|
||||
<li><p><strong>scaler</strong> – value to scale gradients by before optimizing network weights</p></li>
|
||||
<li><p><strong>additional_fetches</strong> – list of additional values to fetch and return. The type of each list
|
||||
element is framework dependent.</p></li>
|
||||
<li><p><strong>importance_weights</strong> – ndarray of shape (batch_size,) to multiply with batch loss.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><p>tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p><p>tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
|
||||
total_loss (float): sum of all head losses
|
||||
losses (list of float): list of all losses. The order is list of target losses followed by list</p>
|
||||
<blockquote>
|
||||
@@ -564,10 +523,8 @@ losses (list of float): list of all losses. The order is list of target losses f
|
||||
<p>norm_unclipped_grads (float): global norm of all gradients before any gradient clipping is applied
|
||||
fetched_tensors: all values for additional_fetches</p>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
@@ -590,47 +547,39 @@ between them.</p>
|
||||
<code class="descname">apply_gradients_and_sync_networks</code><span class="sig-paren">(</span><em>reset_gradients=True</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.apply_gradients_and_sync_networks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_and_sync_networks" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Applies the gradients accumulated in the online network to the global network or to itself and syncs the
|
||||
networks if necessary</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>reset_gradients</strong> – If set to True, the accumulated gradients wont be reset to 0 after applying them to
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>reset_gradients</strong> – If set to True, the accumulated gradients wont be reset to 0 after applying them to
|
||||
the network. This is useful when the accumulated gradients are overwritten instead
|
||||
of accumulated by the accumulate_gradients function. This allows reducing the time
|
||||
complexity for this function by around 10%</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
complexity for this function by around 10%</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_to_global_network">
|
||||
<code class="descname">apply_gradients_to_global_network</code><span class="sig-paren">(</span><em>gradients=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.apply_gradients_to_global_network"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_to_global_network" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Apply gradients from the online network on the global network</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>gradients</strong> – optional gradients that will be used instead of teh accumulated gradients</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>gradients</strong> – optional gradients that will be used instead of teh accumulated gradients</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_to_online_network">
|
||||
<code class="descname">apply_gradients_to_online_network</code><span class="sig-paren">(</span><em>gradients=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.apply_gradients_to_online_network"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.apply_gradients_to_online_network" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Apply gradients from the online network on itself</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -639,124 +588,106 @@ complexity for this function by around 10%</td>
|
||||
<dd><p>Collect all of network’s savers for global or online network
|
||||
Note: global, online, and target network are all copies of the same network whose parameters are</p>
|
||||
<blockquote>
|
||||
<div>updated at different rates. So we only need to save one of the networks; the one that holds the most
|
||||
<div><p>updated at different rates. So we only need to save one of the networks; the one that holds the most
|
||||
recent parameters. target network is created for some agents and used for stabilizing training by
|
||||
updating parameters from online network at a slower rate. As a result, target network never contains
|
||||
the most recent set of parameters. In single-worker training, no global network is created and online
|
||||
network contains the most recent parameters. In vertical distributed training with more than one worker,
|
||||
global network is updated by all workers and contains the most recent parameters.
|
||||
Therefore preference is given to global network if it exists, otherwise online network is used
|
||||
for saving.</div></blockquote>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>parent_path_suffix</strong> – path suffix of the parent of the network wrapper
|
||||
(e.g. could be name of level manager plus name of agent)</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">collection of all checkpoint objects</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
for saving.</p>
|
||||
</div></blockquote>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>parent_path_suffix</strong> – path suffix of the parent of the network wrapper
|
||||
(e.g. could be name of level manager plus name of agent)</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>collection of all checkpoint objects</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.parallel_prediction">
|
||||
<code class="descname">parallel_prediction</code><span class="sig-paren">(</span><em>network_input_tuples: List[Tuple]</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.parallel_prediction"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.parallel_prediction" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Run several network predictions in parallel. Currently this only supports running each of the networks once.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>network_input_tuples</strong> – a list of tuples where the first element is the network (online_network,
|
||||
target_network or global_network) and the second element is the inputs</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the outputs of all the networks in the same order as the inputs were given</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>network_input_tuples</strong> – a list of tuples where the first element is the network (online_network,
|
||||
target_network or global_network) and the second element is the inputs</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the outputs of all the networks in the same order as the inputs were given</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.set_is_training">
|
||||
<code class="descname">set_is_training</code><span class="sig-paren">(</span><em>state: bool</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.set_is_training"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.set_is_training" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Set the phase of the network between training and testing</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state</strong> – The current state (True = Training, False = Testing)</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>state</strong> – The current state (True = Training, False = Testing)</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.sync">
|
||||
<code class="descname">sync</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.sync"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.sync" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Initializes the weights of the networks to match each other</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body"></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.train_and_sync_networks">
|
||||
<code class="descname">train_and_sync_networks</code><span class="sig-paren">(</span><em>inputs</em>, <em>targets</em>, <em>additional_fetches=[]</em>, <em>importance_weights=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.train_and_sync_networks"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.train_and_sync_networks" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>A generic training function that enables multi-threaded training using a global network if necessary.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>inputs</strong> – The inputs for the network.</li>
|
||||
<li><strong>targets</strong> – The targets corresponding to the given inputs</li>
|
||||
<li><strong>additional_fetches</strong> – Any additional tensor the user wants to fetch</li>
|
||||
<li><strong>importance_weights</strong> – A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the samples losses won’t be scaled</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>inputs</strong> – The inputs for the network.</p></li>
|
||||
<li><p><strong>targets</strong> – The targets corresponding to the given inputs</p></li>
|
||||
<li><p><strong>additional_fetches</strong> – Any additional tensor the user wants to fetch</p></li>
|
||||
<li><p><strong>importance_weights</strong> – A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the sample losses won’t be scaled</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">The loss of the training iteration</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>The loss of the training iteration</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
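<p>A minimal training-step sketch based on the signature above, assuming <cite>network</cite> is a <cite>NetworkWrapper</cite> and that the batch arrays are already prepared to match the network’s losses:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre>
# Sketch only: `network` (a NetworkWrapper), `states` and `targets` are assumed to exist.
loss = network.train_and_sync_networks(
    inputs={'observation': states},
    targets=[targets])
</pre></div></div>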
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.update_online_network">
|
||||
<code class="descname">update_online_network</code><span class="sig-paren">(</span><em>rate=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.update_online_network"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.update_online_network" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Copy weights: global network >>> online network</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rate</strong> – the rate of copying the weights - 1 for copying exactly</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>rate</strong> – the rate of copying the weights - 1 for copying exactly</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.architectures.network_wrapper.NetworkWrapper.update_target_network">
|
||||
<code class="descname">update_target_network</code><span class="sig-paren">(</span><em>rate=1.0</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/architectures/network_wrapper.html#NetworkWrapper.update_target_network"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.architectures.network_wrapper.NetworkWrapper.update_target_network" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Copy weights: online network >>> target network</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rate</strong> – the rate of copying the weights - 1 for copying exactly</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>rate</strong> – the rate of copying the weights - 1 for copying exactly</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
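<p>For example, a DQN-style hard update and a soft (Polyak-style) update would both go through this call; only the rate differs. Sketch, assuming <cite>network</cite> is a <cite>NetworkWrapper</cite> that was created with a target network:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre>
# Sketch only.
network.update_target_network(rate=1.0)    # copy the online weights exactly
network.update_target_network(rate=0.001)  # soft update: mix in 0.1% of the online weights per call
</pre></div></div>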
|
||||
|
||||
</dd></dl>
|
||||
@@ -775,7 +706,7 @@ error of this sample. If it is not given, the samples losses won’t be scaled</
|
||||
<a href="../data_stores/index.html" class="btn btn-neutral float-right" title="Data Stores" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../agents/value_optimization/qr_dqn.html" class="btn btn-neutral" title="Quantile Regression DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../agents/value_optimization/qr_dqn.html" class="btn btn-neutral float-left" title="Quantile Regression DQN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -784,7 +715,7 @@ error of this sample. If it is not given, the samples losses won’t be scaled</
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -801,27 +732,16 @@ error of this sample. If it is not given, the samples losses won’t be scaled</
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Core Types — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Core Types — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Orchestrators" href="orchestrators/index.html" />
|
||||
<link href="../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
|
||||
<dt id="rl_coach.core_types.ActionInfo">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.core_types.</code><code class="descname">ActionInfo</code><span class="sig-paren">(</span><em>action: Union[int, float, numpy.ndarray, List], all_action_probabilities: float = 0, action_value: float = 0.0, state_value: float = 0.0, max_action_value: float = None</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/core_types.html#ActionInfo"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.ActionInfo" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Action info is a class that holds an action and various additional details about it.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>action</strong> – the action</li>
|
||||
<li><strong>all_action_probabilities</strong> – the probability that the action was given when selecting it</li>
|
||||
<li><strong>action_value</strong> – the state-action value (Q value) of the action</li>
|
||||
<li><strong>state_value</strong> – the state value (V value) of the state where the action was taken</li>
|
||||
<li><strong>max_action_value</strong> – in case this is an action that was selected randomly, this is the value of the action
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>action</strong> – the action</p></li>
|
||||
<li><p><strong>all_action_probabilities</strong> – the probability that the action was given when selecting it</p></li>
|
||||
<li><p><strong>action_value</strong> – the state-action value (Q value) of the action</p></li>
|
||||
<li><p><strong>state_value</strong> – the state value (V value) of the state where the action was taken</p></li>
|
||||
<li><p><strong>max_action_value</strong> – in case this is an action that was selected randomly, this is the value of the action
|
||||
that received the maximum value. If no value is given, the action is assumed to be the
|
||||
action with the maximum value</li>
|
||||
action with the maximum value</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
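<p>As an illustration, an agent that picked action 2 greedily from a Q-network might build the following <cite>ActionInfo</cite>; the numeric values here are made up:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre>
from rl_coach.core_types import ActionInfo

# Sketch only: illustrative values.
action_info = ActionInfo(action=2,
                         all_action_probabilities=0.85,
                         action_value=1.37,   # state-action value Q(s, a)
                         state_value=1.21)    # state value V(s)
</pre></div></div>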
|
||||
|
||||
</div>
|
||||
@@ -224,44 +223,37 @@ action with the maximum value</li>
|
||||
<dd><p>A wrapper around a list of transitions that helps extract batches of parameters from it.
|
||||
For example, one can extract a list of states corresponding to the list of transitions.
|
||||
The class uses lazy evaluation in order to return each of the available parameters.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transitions</strong> – a list of transitions to extract the batch from</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>transitions</strong> – a list of transitions to extract the batch from</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Batch.actions">
|
||||
<code class="descname">actions</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.actions"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.actions" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>if the actions were not converted to a batch before, extract them to a batch and then return the batch</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> – add an extra dimension to the actions batch</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the actions of the batch</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>expand_dims</strong> – add an extra dimension to the actions batch</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a numpy array containing all the actions of the batch</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
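<p>A short usage sketch of the batch extraction described above, assuming <cite>transitions</cite> is a list of stored <cite>Transition</cite> objects collected from the environment:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre>
from rl_coach.core_types import Batch

# Sketch only: `transitions` is assumed to be a list of Transition objects.
batch = Batch(transitions)
actions = batch.actions()                      # e.g. shape (batch_size,) for discrete actions
actions_2d = batch.actions(expand_dims=True)   # adds an extra trailing dimension
</pre></div></div>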
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Batch.game_overs">
|
||||
<code class="descname">game_overs</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.game_overs"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.game_overs" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>if the game_overs were not converted to a batch before, extract them to a batch and then return the batch</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> – add an extra dimension to the game_overs batch</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the game over flags of the batch</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>expand_dims</strong> – add an extra dimension to the game_overs batch</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a numpy array containing all the game over flags of the batch</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -269,16 +261,14 @@ The class uses lazy evaluation in order to return each of the available paramete
|
||||
<code class="descname">goals</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.goals"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.goals" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>if the goals were not converted to a batch before, extract them to a batch and then return the batch
|
||||
if the goal was not filled, this will raise an exception</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> – add an extra dimension to the goals batch</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the goals of the batch</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>expand_dims</strong> – add an extra dimension to the goals batch</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a numpy array containing all the goals of the batch</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -286,16 +276,14 @@ if the goal was not filled, this will raise an exception</p>
|
||||
<code class="descname">info</code><span class="sig-paren">(</span><em>key</em>, <em>expand_dims=False</em><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.info"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.info" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the
|
||||
batch. if the key is not part of the keys in the info dictionary, this will raise an exception</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> – add an extra dimension to the info batch</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the info values of the batch corresponding to the given key</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>expand_dims</strong> – add an extra dimension to the info batch</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a numpy array containing all the info values of the batch corresponding to the given key</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -309,9 +297,9 @@ batch. if the key is not part of the keys in the info dictionary, this will rais
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Batch.n_step_discounted_rewards">
|
||||
<code class="descname">n_step_discounted_rewards</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.n_step_discounted_rewards"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.n_step_discounted_rewards" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><dl class="docutils">
|
||||
<dt>if the n_step_discounted_rewards were not converted to a batch before, extract them to a batch and then return</dt>
|
||||
<dd>the batch</dd>
|
||||
<dd><dl class="simple">
|
||||
<dt>if the n_step_discounted_rewards were not converted to a batch before, extract them to a batch and then return</dt><dd><p>the batch</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<p>if the n step discounted rewards were not filled, this will raise an exception
|
||||
:param expand_dims: add an extra dimension to the total_returns batch
|
||||
@@ -323,85 +311,69 @@ batch. if the key is not part of the keys in the info dictionary, this will rais
|
||||
<code class="descname">next_states</code><span class="sig-paren">(</span><em>fetches: List[str], expand_dims=False</em><span class="sig-paren">)</span> → Dict[str, numpy.ndarray]<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.next_states"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.next_states" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>follow the keys in fetches to extract the corresponding items from the next states in the batch
|
||||
if these keys were not already extracted before. return only the values corresponding to those keys</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>fetches</strong> – the keys of the state dictionary to extract</li>
|
||||
<li><strong>expand_dims</strong> – add an extra dimension to each of the value batches</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>fetches</strong> – the keys of the state dictionary to extract</p></li>
|
||||
<li><p><strong>expand_dims</strong> – add an extra dimension to each of the value batches</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">a dictionary containing a batch of values correponding to each of the given fetches keys</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a dictionary containing a batch of values correponding to each of the given fetches keys</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Batch.rewards">
|
||||
<code class="descname">rewards</code><span class="sig-paren">(</span><em>expand_dims=False</em><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.rewards"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.rewards" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>if the rewards were not converted to a batch before, extract them to a batch and then return the batch</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>expand_dims</strong> – add an extra dimension to the rewards batch</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a numpy array containing all the rewards of the batch</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>expand_dims</strong> – add an extra dimension to the rewards batch</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a numpy array containing all the rewards of the batch</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Batch.shuffle">
|
||||
<code class="descname">shuffle</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.shuffle"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.shuffle" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Shuffle all the transitions in the batch</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="attribute">
|
||||
<dt id="rl_coach.core_types.Batch.size">
|
||||
<code class="descname">size</code><a class="headerlink" href="#rl_coach.core_types.Batch.size" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the size of the batch</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>the size of the batch</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Batch.slice">
|
||||
<code class="descname">slice</code><span class="sig-paren">(</span><em>start</em>, <em>end</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.slice"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.slice" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Keep a slice from the batch and discard the rest of the batch</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>start</strong> – the start index in the slice</li>
|
||||
<li><strong>end</strong> – the end index in the slice</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>start</strong> – the start index in the slice</p></li>
|
||||
<li><p><strong>end</strong> – the end index in the slice</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">None</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -409,21 +381,17 @@ if these keys were not already extracted before. return only the values correspo
|
||||
<code class="descname">states</code><span class="sig-paren">(</span><em>fetches: List[str], expand_dims=False</em><span class="sig-paren">)</span> → Dict[str, numpy.ndarray]<a class="reference internal" href="../_modules/rl_coach/core_types.html#Batch.states"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Batch.states" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>follow the keys in fetches to extract the corresponding items from the states in the batch
|
||||
if these keys were not already extracted before. return only the values corresponding to those keys</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>fetches</strong> – the keys of the state dictionary to extract</li>
|
||||
<li><strong>expand_dims</strong> – add an extra dimension to each of the value batches</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>fetches</strong> – the keys of the state dictionary to extract</p></li>
|
||||
<li><p><strong>expand_dims</strong> – add an extra dimension to each of the value batches</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">a dictionary containing a batch of values correponding to each of the given fetches keys</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a dictionary containing a batch of values correponding to each of the given fetches keys</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
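<p>Taken together, the accessors above support a simple mini-batching pattern. The following is only an illustrative sketch, assuming a <code>Batch</code> is constructed from a list of previously collected <code>Transition</code> objects and that at least 32 transitions are available:</p>
<div class="highlight"><pre>
from rl_coach.core_types import Batch

# Hedged sketch: 'transitions' is assumed to be a list of Transition objects;
# the accessors below lazily convert them to numpy arrays.
batch = Batch(transitions)
batch.shuffle()                          # shuffle all transitions in place
batch.slice(0, 32)                       # keep only the first 32 transitions

states = batch.states(['observation'])   # {'observation': array with one row per transition}
rewards = batch.rewards()                # array with one reward per transition
assert batch.size == 32                  # size reflects the slice taken above
</pre></div>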
@@ -436,23 +404,19 @@ if these keys were not already extracted before. return only the values correspo
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.core_types.</code><code class="descname">EnvResponse</code><span class="sig-paren">(</span><em>next_state: Dict[str, numpy.ndarray], reward: Union[int, float, numpy.ndarray], game_over: bool, info: Dict = None, goal: numpy.ndarray = None</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/core_types.html#EnvResponse"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.EnvResponse" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>An env response is a collection containing the information returned from the environment after a single action
|
||||
has been performed on it.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>next_state</strong> – The new state that the environment has transitioned into. Assumed to be a dictionary where the
|
||||
observation is located at state[‘observation’]</li>
|
||||
<li><strong>reward</strong> – The reward received from the environment</li>
|
||||
<li><strong>game_over</strong> – A boolean which should be True if the episode terminated after
|
||||
the execution of the action.</li>
|
||||
<li><strong>info</strong> – any additional info from the environment</li>
|
||||
<li><strong>goal</strong> – a goal defined by the environment</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>next_state</strong> – The new state that the environment has transitioned into. Assumed to be a dictionary where the
|
||||
observation is located at state[‘observation’]</p></li>
|
||||
<li><p><strong>reward</strong> – The reward received from the environment</p></li>
|
||||
<li><p><strong>game_over</strong> – A boolean which should be True if the episode terminated after
|
||||
the execution of the action.</p></li>
|
||||
<li><p><strong>info</strong> – any additional info from the environment</p></li>
|
||||
<li><p><strong>goal</strong> – a goal defined by the environment</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
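<p>For illustration only, a hand-built response of the kind an environment wrapper returns after a single step (all values are placeholders):</p>
<div class="highlight"><pre>
import numpy as np
from rl_coach.core_types import EnvResponse

# Illustrative sketch with placeholder values; in practice the environment
# wrapper builds this object after every call to step().
response = EnvResponse(
    next_state={'observation': np.zeros(4)},   # observation stored under 'observation'
    reward=1.0,
    game_over=False,
    info={'frame': 0},                          # hypothetical extra info
)
</pre></div>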
</div>
|
||||
@@ -462,62 +426,50 @@ the execution of the action.</li>
|
||||
<dt id="rl_coach.core_types.Episode">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.core_types.</code><code class="descname">Episode</code><span class="sig-paren">(</span><em>discount: float = 0.99</em>, <em>bootstrap_total_return_from_old_policy: bool = False</em>, <em>n_step: int = -1</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>An Episode represents a set of sequential transitions that ends with a terminal state.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>discount</strong> – the discount factor to use when calculating total returns</li>
|
||||
<li><strong>bootstrap_total_return_from_old_policy</strong> – should the total return be bootstrapped from the values in the
|
||||
memory</li>
|
||||
<li><strong>n_step</strong> – the number of future steps to sum the reward over before bootstrapping</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>discount</strong> – the discount factor to use when calculating total returns</p></li>
|
||||
<li><p><strong>bootstrap_total_return_from_old_policy</strong> – should the total return be bootstrapped from the values in the
|
||||
memory</p></li>
|
||||
<li><p><strong>n_step</strong> – the number of future steps to sum the reward over before bootstrapping</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Episode.get_first_transition">
|
||||
<code class="descname">get_first_transition</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → rl_coach.core_types.Transition<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.get_first_transition"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.get_first_transition" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the first transition in the episode, or None if there are no transitions available</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The first transition in the episode</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>The first transition in the episode</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Episode.get_last_transition">
|
||||
<code class="descname">get_last_transition</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → rl_coach.core_types.Transition<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.get_last_transition"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.get_last_transition" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the last transition in the episode, or None if there are no transitions available</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The last transition in the episode</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>The last transition in the episode</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Episode.get_transition">
|
||||
<code class="descname">get_transition</code><span class="sig-paren">(</span><em>transition_idx: int</em><span class="sig-paren">)</span> → rl_coach.core_types.Transition<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.get_transition"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.get_transition" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get a specific transition by its index.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transition_idx</strong> – The index of the transition to get</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">The transition which is stored in the given index</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>transition_idx</strong> – The index of the transition to get</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>The transition which is stored in the given index</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -526,16 +478,14 @@ memory</li>
|
||||
<dd><p>Get the values for some transition attribute from all the transitions in the episode.
|
||||
For example, this allows getting the rewards for all the transitions as a list by calling
|
||||
get_transitions_attribute(‘reward’)</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>attribute_name</strong> – The name of the attribute to extract from all the transitions</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A list of values from all the transitions according to the attribute given in attribute_name</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>attribute_name</strong> – The name of the attribute to extract from all the transitions</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>A list of values from all the transitions according to the attribute given in attribute_name</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
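<p>A short sketch of the example mentioned above, collecting the per-transition rewards of an episode:</p>
<div class="highlight"><pre>
# Assumes 'episode' is a populated Episode instance.
rewards = episode.get_transitions_attribute('reward')   # one value per transition
undiscounted_return = sum(rewards)
</pre></div>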
<dl class="method">
|
||||
@@ -543,44 +493,36 @@ get_transitions_attribute(‘reward’)</p>
|
||||
<code class="descname">insert</code><span class="sig-paren">(</span><em>transition: rl_coach.core_types.Transition</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.insert"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.insert" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Insert a new transition to the episode. If the game_over flag in the transition is set to True,
|
||||
the episode will be marked as complete.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>transition</strong> – The new transition to insert to the episode</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>transition</strong> – The new transition to insert to the episode</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Episode.is_empty">
|
||||
<code class="descname">is_empty</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → bool<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.is_empty"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.is_empty" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Check if the episode is empty</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A boolean value determining if the episode is empty or not</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>A boolean value determining if the episode is empty or not</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.core_types.Episode.length">
|
||||
<code class="descname">length</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → int<a class="reference internal" href="../_modules/rl_coach/core_types.html#Episode.length"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.core_types.Episode.length" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Return the length of the episode, which is the number of transitions it holds.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The number of transitions in the episode</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>The number of transitions in the episode</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -590,14 +532,11 @@ the episode will be marked as complete.</p>
|
||||
The returns will be calculated according to the rewards of each transition, together with the number of steps
|
||||
to bootstrap from and the discount factor, as defined by n_step and discount respectively when initializing
|
||||
the episode.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
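<p>A hedged end-to-end sketch of filling an episode and querying it, using only the methods documented above (<code>collected_transitions</code> is an assumed list of <code>Transition</code> objects):</p>
<div class="highlight"><pre>
from rl_coach.core_types import Episode

# Hedged sketch: insert pre-collected transitions; a transition whose
# game_over flag is True marks the episode as complete.
episode = Episode(discount=0.99, n_step=1)
for transition in collected_transitions:
    episode.insert(transition)

if not episode.is_empty():
    print(episode.length())                       # number of stored transitions
    first = episode.get_first_transition()
    last = episode.get_last_transition()
    rewards = episode.get_transitions_attribute('reward')
</pre></div>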
@@ -613,25 +552,21 @@ between the agent and the environment. The most basic version should contain the
|
||||
(current state, action, reward, next state, game over)
|
||||
For imitation learning algorithms, if the reward, next state or game over is not known,
|
||||
it is sufficient to store the current state and action taken by the expert.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>state</strong> – The current state. Assumed to be a dictionary where the observation
|
||||
is located at state[‘observation’]</li>
|
||||
<li><strong>action</strong> – The current action that was taken</li>
|
||||
<li><strong>reward</strong> – The reward received from the environment</li>
|
||||
<li><strong>next_state</strong> – The next state of the environment after applying the action.
|
||||
The next state should be similar to the state in its structure.</li>
|
||||
<li><strong>game_over</strong> – A boolean which should be True if the episode terminated after
|
||||
the execution of the action.</li>
|
||||
<li><strong>info</strong> – A dictionary containing any additional information to be stored in the transition</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>state</strong> – The current state. Assumed to be a dictionary where the observation
|
||||
is located at state[‘observation’]</p></li>
|
||||
<li><p><strong>action</strong> – The current action that was taken</p></li>
|
||||
<li><p><strong>reward</strong> – The reward received from the environment</p></li>
|
||||
<li><p><strong>next_state</strong> – The next state of the environment after applying the action.
|
||||
The next state should be similar to the state in its structure.</p></li>
|
||||
<li><p><strong>game_over</strong> – A boolean which should be True if the episode terminated after
|
||||
the execution of the action.</p></li>
|
||||
<li><p><strong>info</strong> – A dictionary containing any additional information to be stored in the transition</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
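<p>A hedged sketch of building a single transition by hand (all values are placeholders); as noted above, for imitation learning only the state and the action are strictly required:</p>
<div class="highlight"><pre>
import numpy as np
from rl_coach.core_types import Transition

# Placeholder values; the observation lives under the 'observation' key.
transition = Transition(
    state={'observation': np.zeros(4)},
    action=0,
    reward=1.0,
    next_state={'observation': np.ones(4)},
    game_over=False,
    info={'note': 'any extra data worth keeping'},
)
</pre></div>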
</div>
|
||||
@@ -648,7 +583,7 @@ the execution of the action.</li>
|
||||
<a href="spaces.html" class="btn btn-neutral float-right" title="Spaces" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="orchestrators/index.html" class="btn btn-neutral" title="Orchestrators" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
<a href="orchestrators/index.html" class="btn btn-neutral float-left" title="Orchestrators" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -657,7 +592,7 @@ the execution of the action.</li>
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -674,27 +609,16 @@ the execution of the action.</li>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Data Stores — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Data Stores — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Architectures" href="../architectures/index.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -194,14 +197,11 @@
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.data_stores.s3_data_store.</code><code class="descname">S3DataStore</code><span class="sig-paren">(</span><em>params: rl_coach.data_stores.s3_data_store.S3DataStoreParameters</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/data_stores/s3_data_store.html#S3DataStore"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.data_stores.s3_data_store.S3DataStore" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>An implementation of the data store using S3 for storing policy checkpoints when using Coach in distributed mode.
|
||||
The policy checkpoints are written by the trainer and read by the rollout worker.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>params</strong> – The parameters required to use the S3 data store.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>params</strong> – The parameters required to use the S3 data store.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
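<p>The fields of <code>S3DataStoreParameters</code> are not documented on this page, so the following is only a hypothetical sketch of wiring the data store into a distributed run; the keyword arguments are assumed names used for illustration, not the verified API:</p>
<div class="highlight"><pre>
from rl_coach.data_stores.s3_data_store import S3DataStore, S3DataStoreParameters

# Hypothetical sketch: 'end_point', 'bucket_name' and 'checkpoint_dir' are
# assumed parameter names; consult S3DataStoreParameters for the actual fields.
params = S3DataStoreParameters(
    end_point='s3.example.com',
    bucket_name='coach-checkpoints',
    checkpoint_dir='/checkpoints',
)
data_store = S3DataStore(params)   # trainer saves checkpoints, rollout workers load them
</pre></div>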
</div>
|
||||
@@ -212,14 +212,11 @@ The policy checkpoints are written by the trainer and read by the rollout worker
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.data_stores.nfs_data_store.</code><code class="descname">NFSDataStore</code><span class="sig-paren">(</span><em>params: rl_coach.data_stores.nfs_data_store.NFSDataStoreParameters</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/data_stores/nfs_data_store.html#NFSDataStore"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.data_stores.nfs_data_store.NFSDataStore" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>An implementation of a data store which uses NFS for storing policy checkpoints when using Coach in distributed mode.
|
||||
The policy checkpoints are written by the trainer and read by the rollout worker.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>params</strong> – The parameters required to use the NFS data store.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>params</strong> – The parameters required to use the NFS data store.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -236,7 +233,7 @@ The policy checkpoints are written by the trainer and read by the rollout worker
|
||||
<a href="../environments/index.html" class="btn btn-neutral float-right" title="Environments" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../architectures/index.html" class="btn btn-neutral" title="Architectures" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../architectures/index.html" class="btn btn-neutral float-left" title="Architectures" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
@@ -245,7 +242,7 @@ The policy checkpoints are written by the trainer and read by the rollout worker
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -262,27 +259,16 @@ The policy checkpoints are written by the trainer and read by the rollout worker
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Environments — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Environments — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Data Stores" href="../data_stores/index.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -193,106 +196,84 @@
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.environments.environment.Environment">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.environments.environment.</code><code class="descname">Environment</code><span class="sig-paren">(</span><em>level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, **kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>level</strong> – The environment level. Each environment can have multiple levels</li>
|
||||
<li><strong>seed</strong> – a seed for the random number generator of the environment</li>
|
||||
<li><strong>frame_skip</strong> – number of frames to skip (while repeating the same action) between each two agent directives</li>
|
||||
<li><strong>human_control</strong> – human should control the environment</li>
|
||||
<li><strong>visualization_parameters</strong> – a blob of parameters used for visualization of the environment</li>
|
||||
<li><strong>**kwargs</strong> – <p>as the class is instantiated by EnvironmentParameters, this is used to support having
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>level</strong> – The environment level. Each environment can have multiple levels</p></li>
|
||||
<li><p><strong>seed</strong> – a seed for the random number generator of the environment</p></li>
|
||||
<li><p><strong>frame_skip</strong> – number of frames to skip (while repeating the same action) between each two agent directives</p></li>
|
||||
<li><p><strong>human_control</strong> – human should control the environment</p></li>
|
||||
<li><p><strong>visualization_parameters</strong> – a blob of parameters used for visualization of the environment</p></li>
|
||||
<li><p><strong>**kwargs</strong> – <p>as the class is instantiated by EnvironmentParameters, this is used to support having
|
||||
additional arguments which will be ignored by this class, but might be used by others</p>
|
||||
</li>
|
||||
</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="attribute">
|
||||
<dt id="rl_coach.environments.environment.Environment.action_space">
|
||||
<code class="descname">action_space</code><a class="headerlink" href="#rl_coach.environments.environment.Environment.action_space" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the action space of the environment</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the action space</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>the action space</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.close">
|
||||
<code class="descname">close</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.close"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.close" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Clean up steps.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.get_action_from_user">
|
||||
<code class="descname">get_action_from_user</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → Union[int, float, numpy.ndarray, List]<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_action_from_user"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_action_from_user" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get an action from the user keyboard</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">action index</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>action index</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.get_available_keys">
|
||||
<code class="descname">get_available_keys</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → List[Tuple[str, Union[int, float, numpy.ndarray, List]]]<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_available_keys"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_available_keys" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Return a list of tuples mapping between action names and the keyboard key that triggers them</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">a list of tuples mapping between action names and the keyboard key that triggers them</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a list of tuples mapping between action names and the keyboard key that triggers them</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.get_goal">
|
||||
<code class="descname">get_goal</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → Union[None, numpy.ndarray]<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_goal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_goal" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the current goal that the agent needs to achieve in the environment</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">The goal</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>The goal</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.get_random_action">
|
||||
<code class="descname">get_random_action</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → Union[int, float, numpy.ndarray, List]<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_random_action"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_random_action" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Returns an action picked uniformly from the available actions</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">a numpy array with a random action</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a numpy array with a random action</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -300,56 +281,44 @@ additional arguments which will be ignored by this class, but might be used by o
|
||||
<code class="descname">get_rendered_image</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.get_rendered_image"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.get_rendered_image" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Return a numpy array containing the image that will be rendered to the screen.
|
||||
This can be different from the observation. For example, mujoco’s observation is a measurements vector.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">numpy array containing the image that will be rendered to the screen</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>numpy array containing the image that will be rendered to the screen</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="attribute">
|
||||
<dt id="rl_coach.environments.environment.Environment.goal_space">
|
||||
<code class="descname">goal_space</code><a class="headerlink" href="#rl_coach.environments.environment.Environment.goal_space" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the goal space of the environment</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the observation space</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>the observation space</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.handle_episode_ended">
|
||||
<code class="descname">handle_episode_ended</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.handle_episode_ended"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.handle_episode_ended" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>End an episode</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="attribute">
|
||||
<dt id="rl_coach.environments.environment.Environment.last_env_response">
|
||||
<code class="descname">last_env_response</code><a class="headerlink" href="#rl_coach.environments.environment.Environment.last_env_response" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the last environment response</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">a dictionary that contains the state, reward, etc.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a dictionary that contains the state, reward, etc.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="attribute">
|
||||
@@ -363,76 +332,64 @@ This can be different from the observation. For example, mujoco’s observation
|
||||
<dt id="rl_coach.environments.environment.Environment.render">
|
||||
<code class="descname">render</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.render"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.render" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Call the environment function for rendering to the screen</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.reset_internal_state">
|
||||
<code class="descname">reset_internal_state</code><span class="sig-paren">(</span><em>force_environment_reset=False</em><span class="sig-paren">)</span> → rl_coach.core_types.EnvResponse<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.reset_internal_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.reset_internal_state" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Reset the environment and all the variables of the wrapper</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>force_environment_reset</strong> – forces environment reset even when the game did not end</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A dictionary containing the observation, reward, done flag, action and measurements</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>force_environment_reset</strong> – forces environment reset even when the game did not end</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>A dictionary containing the observation, reward, done flag, action and measurements</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.set_goal">
|
||||
<code class="descname">set_goal</code><span class="sig-paren">(</span><em>goal: Union[None, numpy.ndarray]</em><span class="sig-paren">)</span> → None<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.set_goal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.set_goal" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Set the current goal that the agent needs to achieve in the environment</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>goal</strong> – the goal that needs to be achieved</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">None</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>goal</strong> – the goal that needs to be achieved</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>None</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
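<p>For goal-based environments, a small hedged sketch of reading the current goal and setting it back (or replacing it):</p>
<div class="highlight"><pre>
# Hedged sketch: 'env' is an Environment instance of a goal-based environment.
goal = env.get_goal()          # None if the environment defines no goal
if goal is not None:
    env.set_goal(goal)         # re-assert the goal, or pass a new numpy array instead
</pre></div>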
<dl class="attribute">
|
||||
<dt id="rl_coach.environments.environment.Environment.state_space">
|
||||
<code class="descname">state_space</code><a class="headerlink" href="#rl_coach.environments.environment.Environment.state_space" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get the state space of the environment</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">the observation space</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>the observation space</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.environments.environment.Environment.step">
|
||||
<code class="descname">step</code><span class="sig-paren">(</span><em>action: Union[int, float, numpy.ndarray, List]</em><span class="sig-paren">)</span> → rl_coach.core_types.EnvResponse<a class="reference internal" href="../../_modules/rl_coach/environments/environment.html#Environment.step"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.environment.Environment.step" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Make a single step in the environment using the given action</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> – an action to use for stepping the environment. Should follow the definition of the action space.</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the environment response as returned in get_last_env_response</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action</strong> – an action to use for stepping the environment. Should follow the definition of the action space.</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the environment response as returned in get_last_env_response</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</dd></dl>
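<p>For orientation, a minimal interaction sketch using the methods documented above is shown below. It assumes an already-constructed <code>Environment</code> subclass instance <code>env</code> whose action space exposes <code>sample()</code>, and that episodes are reset through <code>reset_internal_state()</code>; treat these names as assumptions against your Coach version rather than part of the generated reference.</p>
<pre>
# Hedged sketch: stepping an environment manually (normally the graph manager drives this loop).
env.reset_internal_state(force_environment_reset=True)
for _ in range(1000):
    action = env.action_space.sample()   # assumed helper on the action space
    response = env.step(action)          # EnvResponse, the same object returned by get_last_env_response
    if response.game_over:
        env.reset_internal_state()
</pre>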
@@ -444,38 +401,34 @@ This can be different from the observation. For example, mujoco’s observation
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.environments.control_suite_environment.ControlSuiteEnvironment">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.environments.control_suite_environment.</code><code class="descname">ControlSuiteEnvironment</code><span class="sig-paren">(</span><em>level: rl_coach.environments.environment.LevelSelection</em>, <em>frame_skip: int</em>, <em>visualization_parameters: rl_coach.base_parameters.VisualizationParameters</em>, <em>target_success_rate: float = 1.0</em>, <em>seed: Union[None</em>, <em>int] = None</em>, <em>human_control: bool = False</em>, <em>observation_type: rl_coach.environments.control_suite_environment.ObservationType = <ObservationType.Measurements: 1></em>, <em>custom_reward_threshold: Union[int</em>, <em>float] = None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/environments/control_suite_environment.html#ControlSuiteEnvironment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.control_suite_environment.ControlSuiteEnvironment" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>level</strong> – (str)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>level</strong> – (str)
|
||||
A string representing the control suite level to run. This can also be a LevelSelection object.
|
||||
For example, cartpole:swingup.</li>
|
||||
<li><strong>frame_skip</strong> – (int)
|
||||
For example, cartpole:swingup.</p></li>
|
||||
<li><p><strong>frame_skip</strong> – (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.</li>
|
||||
<li><strong>visualization_parameters</strong> – (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</li>
|
||||
<li><strong>target_success_rate</strong> – (float)
|
||||
Stop experiment if given target success rate was achieved.</li>
|
||||
<li><strong>seed</strong> – (int)
|
||||
A seed to use for the random number generator when running the environment.</li>
|
||||
<li><strong>human_control</strong> – (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.</li>
|
||||
<li><strong>observation_type</strong> – (ObservationType)
|
||||
for all the skipped frames.</p></li>
|
||||
<li><p><strong>visualization_parameters</strong> – (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</p></li>
|
||||
<li><p><strong>target_success_rate</strong> – (float)
|
||||
Stop experiment if given target success rate was achieved.</p></li>
|
||||
<li><p><strong>seed</strong> – (int)
|
||||
A seed to use for the random number generator when running the environment.</p></li>
|
||||
<li><p><strong>human_control</strong> – (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.</p></li>
|
||||
<li><p><strong>observation_type</strong> – (ObservationType)
|
||||
An enum which defines which observation to use. The current options are to use:
|
||||
* Measurements only - a vector of joint torques and similar measurements
|
||||
* Image only - an image of the environment as seen by a camera attached to the simulator
|
||||
* Measurements & Image - both types of observations will be returned in the state using the keys
|
||||
‘measurements’ and ‘pixels’ respectively.</li>
|
||||
<li><strong>custom_reward_threshold</strong> – (float)
|
||||
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.</li>
|
||||
‘measurements’ and ‘pixels’ respectively.</p></li>
|
||||
<li><p><strong>custom_reward_threshold</strong> – (float)
|
||||
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
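<p>A hedged construction sketch for the class above, using only the documented arguments (in presets one would normally go through an <code>EnvironmentParameters</code> helper such as <code>ControlSuiteEnvironmentParameters</code>; that helper name is an assumption here):</p>
<pre>
# Hedged sketch, assuming the DeepMind Control Suite is installed.
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.control_suite_environment import ControlSuiteEnvironment, ObservationType

env = ControlSuiteEnvironment(level='cartpole:swingup',            # a plain string is accepted per the docstring
                              frame_skip=1,
                              visualization_parameters=VisualizationParameters(),
                              observation_type=ObservationType.Measurements,
                              seed=0)
</pre>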
</div>
|
||||
@@ -496,47 +449,39 @@ Allows defining a custom reward that will be used to decide when the agent succe
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.environments.doom_environment.DoomEnvironment">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.environments.doom_environment.</code><code class="descname">DoomEnvironment</code><span class="sig-paren">(</span><em>level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, cameras: List[rl_coach.environments.doom_environment.DoomEnvironment.CameraTypes], target_success_rate: float = 1.0, **kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/environments/doom_environment.html#DoomEnvironment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.doom_environment.DoomEnvironment" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>level</strong> – (str)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>level</strong> – (str)
|
||||
A string representing the doom level to run. This can also be a LevelSelection object.
|
||||
This should be one of the levels defined in the DoomLevel enum. For example, HEALTH_GATHERING.</li>
|
||||
<li><strong>seed</strong> – (int)
|
||||
A seed to use for the random number generator when running the environment.</li>
|
||||
<li><strong>frame_skip</strong> – (int)
|
||||
This should be one of the levels defined in the DoomLevel enum. For example, HEALTH_GATHERING.</p></li>
|
||||
<li><p><strong>seed</strong> – (int)
|
||||
A seed to use for the random number generator when running the environment.</p></li>
|
||||
<li><p><strong>frame_skip</strong> – (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.</li>
|
||||
<li><strong>human_control</strong> – (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.</li>
|
||||
<li><strong>custom_reward_threshold</strong> – (float)
|
||||
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.</li>
|
||||
<li><strong>visualization_parameters</strong> – (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</li>
|
||||
<li><strong>cameras</strong> – <p>(List[CameraTypes])
|
||||
for all the skipped frames.</p></li>
|
||||
<li><p><strong>human_control</strong> – (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.</p></li>
|
||||
<li><p><strong>custom_reward_threshold</strong> – (float)
|
||||
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.</p></li>
|
||||
<li><p><strong>visualization_parameters</strong> – (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</p></li>
|
||||
<li><p><strong>cameras</strong> – <p>(List[CameraTypes])
|
||||
A list of camera types to use as observation in the state returned from the environment.
|
||||
Each camera should be an enum from CameraTypes, and there are several options like an RGB observation,
|
||||
a depth map, a segmentation map, and a top-down map of the environment.</p>
|
||||
<blockquote>
|
||||
<div><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name" colspan="2">param target_success_rate:</th></tr>
|
||||
<tr class="field-odd field"><td> </td><td class="field-body">(float)
|
||||
Stop experiment if given target success rate was achieved.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<div><dl class="field-list simple">
|
||||
<dt class="field-odd">param target_success_rate</dt>
|
||||
<dd class="field-odd"><p>(float)
|
||||
Stop experiment if given target success rate was achieved.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</div></blockquote>
|
||||
</li>
|
||||
</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -557,54 +502,50 @@ Additionally, it can be extended using the API defined by the authors.</p>
|
||||
<p>Website: <a class="reference external" href="https://gym.openai.com/">OpenAI Gym</a></p>
|
||||
<p>In Coach, we support all the native environments in Gym, along with several extensions such as:</p>
|
||||
<ul class="simple">
|
||||
<li><a class="reference external" href="https://github.com/openai/roboschool">Roboschool</a> - a set of environments powered by the PyBullet engine,
|
||||
that offer a free alternative to MuJoCo.</li>
|
||||
<li><a class="reference external" href="https://github.com/Breakend/gym-extensions">Gym Extensions</a> - a set of environments that extends Gym for
|
||||
auxiliary tasks (multitask learning, transfer learning, inverse reinforcement learning, etc.)</li>
|
||||
<li><a class="reference external" href="https://github.com/bulletphysics/bullet3/tree/master/examples/pybullet">PyBullet</a> - a physics engine that
|
||||
includes a set of robotics environments.</li>
|
||||
<li><p><a class="reference external" href="https://github.com/openai/roboschool">Roboschool</a> - a set of environments powered by the PyBullet engine,
|
||||
that offer a free alternative to MuJoCo.</p></li>
|
||||
<li><p><a class="reference external" href="https://github.com/Breakend/gym-extensions">Gym Extensions</a> - a set of environments that extends Gym for
|
||||
auxiliary tasks (multitask learning, transfer learning, inverse reinforcement learning, etc.)</p></li>
|
||||
<li><p><a class="reference external" href="https://github.com/bulletphysics/bullet3/tree/master/examples/pybullet">PyBullet</a> - a physics engine that
|
||||
includes a set of robotics environments.</p></li>
|
||||
</ul>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.environments.gym_environment.GymEnvironment">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.environments.gym_environment.</code><code class="descname">GymEnvironment</code><span class="sig-paren">(</span><em>level: rl_coach.environments.environment.LevelSelection</em>, <em>frame_skip: int</em>, <em>visualization_parameters: rl_coach.base_parameters.VisualizationParameters</em>, <em>target_success_rate: float = 1.0</em>, <em>additional_simulator_parameters: Dict[str</em>, <em>Any] = {}</em>, <em>seed: Union[None</em>, <em>int] = None</em>, <em>human_control: bool = False</em>, <em>custom_reward_threshold: Union[int</em>, <em>float] = None</em>, <em>random_initialization_steps: int = 1</em>, <em>max_over_num_frames: int = 1</em>, <em>observation_space_type: rl_coach.environments.gym_environment.ObservationSpaceType = None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/environments/gym_environment.html#GymEnvironment"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.environments.gym_environment.GymEnvironment" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>level</strong> – (str)
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>level</strong> – (str)
|
||||
A string representing the gym level to run. This can also be a LevelSelection object.
|
||||
For example, BreakoutDeterministic-v0</li>
|
||||
<li><strong>frame_skip</strong> – (int)
|
||||
For example, BreakoutDeterministic-v0</p></li>
|
||||
<li><p><strong>frame_skip</strong> – (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.</li>
|
||||
<li><strong>visualization_parameters</strong> – (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</li>
|
||||
<li><strong>additional_simulator_parameters</strong> – (Dict[str, Any])
|
||||
for all the skipped frames.</p></li>
|
||||
<li><p><strong>visualization_parameters</strong> – (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.</p></li>
|
||||
<li><p><strong>additional_simulator_parameters</strong> – (Dict[str, Any])
|
||||
Any additional parameters that the user can pass to the Gym environment. These parameters should be
|
||||
accepted by the __init__ function of the implemented Gym environment.</li>
|
||||
<li><strong>seed</strong> – (int)
|
||||
A seed to use for the random number generator when running the environment.</li>
|
||||
<li><strong>human_control</strong> – (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.</li>
|
||||
<li><strong>custom_reward_threshold</strong> – (float)
|
||||
accepted by the __init__ function of the implemented Gym environment.</p></li>
|
||||
<li><p><strong>seed</strong> – (int)
|
||||
A seed to use for the random number generator when running the environment.</p></li>
|
||||
<li><p><strong>human_control</strong> – (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.</p></li>
|
||||
<li><p><strong>custom_reward_threshold</strong> – (float)
|
||||
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.
|
||||
If not set, this value will be taken from the Gym environment definition.</li>
|
||||
<li><strong>random_initialization_steps</strong> – (int)
|
||||
If not set, this value will be taken from the Gym environment definition.</p></li>
|
||||
<li><p><strong>random_initialization_steps</strong> – (int)
|
||||
The number of random steps that will be taken in the environment after each reset.
|
||||
This is a feature presented in the DQN paper, which improves the variability of the episodes the agent sees.</li>
|
||||
<li><strong>max_over_num_frames</strong> – (int)
|
||||
This is a feature presented in the DQN paper, which improves the variability of the episodes the agent sees.</p></li>
|
||||
<li><p><strong>max_over_num_frames</strong> – (int)
|
||||
This value will be used for merging multiple frames into a single frame by taking the maximum value for each
|
||||
of the pixels in the frame. This is particularly used in Atari games, where the frames flicker, and objects
|
||||
can be seen in one frame but disappear in the next.</li>
|
||||
<li><strong>observation_space_type</strong> – This value will be used for generating observation space. Allows a custom space. Should be one of
|
||||
can be seen in one frame but disappear in the next.</p></li>
|
||||
<li><p><strong>observation_space_type</strong> – This value will be used for generating observation space. Allows a custom space. Should be one of
|
||||
ObservationSpaceType. If not specified, observation space is inferred from the number of dimensions
|
||||
of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, PlanarMaps space otherwise.</li>
|
||||
of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, PlanarMaps space otherwise.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
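<p>In presets, this class is usually configured through an environment-parameters helper rather than instantiated directly. A hedged sketch (the <code>GymVectorEnvironment</code> name follows common Coach presets; treat it as an assumption if your version differs):</p>
<pre>
# Hedged sketch of a preset-style Gym environment definition.
from rl_coach.environments.gym_environment import GymVectorEnvironment

env_params = GymVectorEnvironment(level='CartPole-v0')
env_params.frame_skip = 1
env_params.additional_simulator_parameters = {}   # forwarded to the Gym env's __init__, per the docstring
</pre>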
</div>
|
||||
@@ -621,7 +562,7 @@ of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, Planar
|
||||
<a href="../exploration_policies/index.html" class="btn btn-neutral float-right" title="Exploration Policies" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../data_stores/index.html" class="btn btn-neutral" title="Data Stores" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../data_stores/index.html" class="btn btn-neutral float-left" title="Data Stores" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -630,7 +571,7 @@ of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, Planar
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
</p>
|
||||
</div>
|
||||
@@ -647,27 +588,16 @@ of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, Planar
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Exploration Policies — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Exploration Policies — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Environments" href="../environments/index.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -202,62 +205,62 @@ predefined policy. This is one of the most important aspects of reinforcement le
|
||||
tuning to get it right. Coach supports several pre-defined exploration policies, and it can be easily extended with
|
||||
custom policies. Note that not all exploration policies are expected to work for both discrete and continuous action
|
||||
spaces.</p>
|
||||
<table border="1" class="docutils">
|
||||
<table class="docutils align-center">
|
||||
<colgroup>
|
||||
<col width="35%" />
|
||||
<col width="37%" />
|
||||
<col width="29%" />
|
||||
<col style="width: 35%" />
|
||||
<col style="width: 37%" />
|
||||
<col style="width: 29%" />
|
||||
</colgroup>
|
||||
<thead valign="bottom">
|
||||
<tr class="row-odd"><th class="head">Exploration Policy</th>
|
||||
<th class="head">Discrete Action Space</th>
|
||||
<th class="head">Box Action Space</th>
|
||||
<thead>
|
||||
<tr class="row-odd"><th class="head"><p>Exploration Policy</p></th>
|
||||
<th class="head"><p>Discrete Action Space</p></th>
|
||||
<th class="head"><p>Box Action Space</p></th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody valign="top">
|
||||
<tr class="row-even"><td>AdditiveNoise</td>
|
||||
<td><span class="red">X</span></td>
|
||||
<td><span class="green">V</span></td>
|
||||
<tbody>
|
||||
<tr class="row-even"><td><p>AdditiveNoise</p></td>
|
||||
<td><p><span class="red">X</span></p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-odd"><td>Boltzmann</td>
|
||||
<td><span class="green">V</span></td>
|
||||
<td><span class="red">X</span></td>
|
||||
<tr class="row-odd"><td><p>Boltzmann</p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
<td><p><span class="red">X</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-even"><td>Bootstrapped</td>
|
||||
<td><span class="green">V</span></td>
|
||||
<td><span class="red">X</span></td>
|
||||
<tr class="row-even"><td><p>Bootstrapped</p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
<td><p><span class="red">X</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-odd"><td>Categorical</td>
|
||||
<td><span class="green">V</span></td>
|
||||
<td><span class="red">X</span></td>
|
||||
<tr class="row-odd"><td><p>Categorical</p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
<td><p><span class="red">X</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-even"><td>ContinuousEntropy</td>
|
||||
<td><span class="red">X</span></td>
|
||||
<td><span class="green">V</span></td>
|
||||
<tr class="row-even"><td><p>ContinuousEntropy</p></td>
|
||||
<td><p><span class="red">X</span></p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-odd"><td>EGreedy</td>
|
||||
<td><span class="green">V</span></td>
|
||||
<td><span class="green">V</span></td>
|
||||
<tr class="row-odd"><td><p>EGreedy</p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-even"><td>Greedy</td>
|
||||
<td><span class="green">V</span></td>
|
||||
<td><span class="green">V</span></td>
|
||||
<tr class="row-even"><td><p>Greedy</p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-odd"><td>OUProcess</td>
|
||||
<td><span class="red">X</span></td>
|
||||
<td><span class="green">V</span></td>
|
||||
<tr class="row-odd"><td><p>OUProcess</p></td>
|
||||
<td><p><span class="red">X</span></p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-even"><td>ParameterNoise</td>
|
||||
<td><span class="green">V</span></td>
|
||||
<td><span class="green">V</span></td>
|
||||
<tr class="row-even"><td><p>ParameterNoise</p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-odd"><td>TruncatedNormal</td>
|
||||
<td><span class="red">X</span></td>
|
||||
<td><span class="green">V</span></td>
|
||||
<tr class="row-odd"><td><p>TruncatedNormal</p></td>
|
||||
<td><p><span class="red">X</span></p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
</tr>
|
||||
<tr class="row-even"><td>UCB</td>
|
||||
<td><span class="green">V</span></td>
|
||||
<td><span class="red">X</span></td>
|
||||
<tr class="row-even"><td><p>UCB</p></td>
|
||||
<td><p><span class="green">V</span></p></td>
|
||||
<td><p><span class="red">X</span></p></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
@@ -268,14 +271,11 @@ spaces.</p>
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.exploration_policies.exploration_policy.</code><code class="descname">ExplorationPolicy</code><span class="sig-paren">(</span><em>action_space: rl_coach.spaces.ActionSpace</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/exploration_policies/exploration_policy.html#ExplorationPolicy"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.exploration_policies.exploration_policy.ExplorationPolicy" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>An exploration policy takes the predicted actions or action values from the agent, and selects the action to
|
||||
actually apply to the environment using some predefined algorithm.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> – the action space used by the environment</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action_space</strong> – the action space used by the environment</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.exploration_policies.exploration_policy.ExplorationPolicy.change_phase">
|
||||
<code class="descname">change_phase</code><span class="sig-paren">(</span><em>phase</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/exploration_policies/exploration_policy.html#ExplorationPolicy.change_phase"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.exploration_policies.exploration_policy.ExplorationPolicy.change_phase" title="Permalink to this definition">¶</a></dt>
|
||||
@@ -323,20 +323,16 @@ can be given in two different ways:
|
||||
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
|
||||
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>action_space</strong> – the action space used by the environment</li>
|
||||
<li><strong>noise_percentage_schedule</strong> – the schedule for the noise variance percentage relative to the absolute range
|
||||
of the action space</li>
|
||||
<li><strong>evaluation_noise_percentage</strong> – the noise variance percentage that will be used during evaluation phases</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>action_space</strong> – the action space used by the environment</p></li>
|
||||
<li><p><strong>noise_percentage_schedule</strong> – the schedule for the noise variance percentage relative to the absolute range
|
||||
of the action space</p></li>
|
||||
<li><p><strong>evaluation_noise_percentage</strong> – the noise variance percentage that will be used during evaluation phases</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
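<p>A hedged configuration sketch for the schedule arguments listed above (the module path, the <code>AdditiveNoiseParameters</code> class and the <code>LinearSchedule</code> helper follow the usual Coach layout and should be verified against your version):</p>
<pre>
# Decay the additive noise from 50% to 5% of the action range over 100k steps.
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.schedules import LinearSchedule

exploration_params = AdditiveNoiseParameters()
exploration_params.noise_percentage_schedule = LinearSchedule(0.5, 0.05, 100000)   # attribute names mirror the documented arguments (assumption)
exploration_params.evaluation_noise_percentage = 0.05
</pre>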
</div>
|
||||
@@ -349,18 +345,14 @@ of the action space</li>
|
||||
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
|
||||
into a distribution over the actions. It then samples the action for playing out of the calculated distribution.
|
||||
An additional temperature schedule can be given by the user, and will control the steepness of the softmax function.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>action_space</strong> – the action space used by the environment</li>
|
||||
<li><strong>temperature_schedule</strong> – the schedule for the temperature parameter of the softmax</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>action_space</strong> – the action space used by the environment</p></li>
|
||||
<li><p><strong>temperature_schedule</strong> – the schedule for the temperature parameter of the softmax</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -375,26 +367,22 @@ values for all the possible actions. For each episode, a single head is selected
|
||||
to its value predictions. In evaluation, the action is selected using a majority vote over all the heads
|
||||
predictions.</p>
|
||||
<div class="admonition note">
|
||||
<p class="first admonition-title">Note</p>
|
||||
<p class="last">This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
|
||||
<p class="admonition-title">Note</p>
|
||||
<p>This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
|
||||
since it requires the agent to have a network with multiple heads.</p>
|
||||
</div>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>action_space</strong> – the action space used by the environment</li>
|
||||
<li><strong>epsilon_schedule</strong> – a schedule for the epsilon values</li>
|
||||
<li><strong>evaluation_epsilon</strong> – the epsilon value to use for evaluation phases</li>
|
||||
<li><strong>continuous_exploration_policy_parameters</strong> – the parameters of the continuous exploration policy to use
|
||||
if the e-greedy is used for a continuous policy</li>
|
||||
<li><strong>architecture_num_q_heads</strong> – the number of q heads to select from</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>action_space</strong> – the action space used by the environment</p></li>
|
||||
<li><p><strong>epsilon_schedule</strong> – a schedule for the epsilon values</p></li>
|
||||
<li><p><strong>evaluation_epsilon</strong> – the epsilon value to use for evaluation phases</p></li>
|
||||
<li><p><strong>continuous_exploration_policy_parameters</strong> – the parameters of the continuous exploration policy to use
|
||||
if the e-greedy is used for a continuous policy</p></li>
|
||||
<li><p><strong>architecture_num_q_heads</strong> – the number of q heads to select from</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -407,14 +395,11 @@ if the e-greedy is used for a continuous policy</li>
|
||||
represent a probability distribution over the action, from which a single action will be sampled.
|
||||
In evaluation, the action that has the highest probability will be selected. This is particularly useful for
|
||||
actor-critic schemes, where the actor's output is a probability distribution over the actions.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> – the action space used by the environment</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action_space</strong> – the action space used by the environment</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -429,24 +414,20 @@ implemented by adding a regularization factor to the network loss, which regular
|
||||
This exploration policy is only intended for continuous action spaces, and assumes that the entire calculation
|
||||
is implemented as part of the head.</p>
|
||||
<div class="admonition warning">
|
||||
<p class="first admonition-title">Warning</p>
|
||||
<p class="last">This exploration policy expects the agent or the network to implement the exploration functionality.
|
||||
<p class="admonition-title">Warning</p>
|
||||
<p>This exploration policy expects the agent or the network to implement the exploration functionality.
|
||||
There are only a few heads that actually are relevant and implement the entropy regularization factor.</p>
|
||||
</div>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>action_space</strong> – the action space used by the environment</li>
|
||||
<li><strong>noise_percentage_schedule</strong> – the schedule for the noise variance percentage relative to the absolute range
|
||||
of the action space</li>
|
||||
<li><strong>evaluation_noise_percentage</strong> – the noise variance percentage that will be used during evaluation phases</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>action_space</strong> – the action space used by the environment</p></li>
|
||||
<li><p><strong>noise_percentage_schedule</strong> – the schedule for the noise variance percentage relative to the absolute range
|
||||
of the action space</p></li>
|
||||
<li><p><strong>evaluation_noise_percentage</strong> – the noise variance percentage that will be used during evaluation phases</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -464,21 +445,17 @@ In evaluation, a different epsilon value can be specified.</p>
|
||||
it samples a random action out of the action space bounds. Otherwise, it selects the action according to a
|
||||
given continuous exploration policy, which is set to AdditiveNoise by default. In evaluation, the action is
|
||||
always selected according to the given continuous exploration policy (where its phase is set to evaluation as well).</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>action_space</strong> – the action space used by the environment</li>
|
||||
<li><strong>epsilon_schedule</strong> – a schedule for the epsilon values</li>
|
||||
<li><strong>evaluation_epsilon</strong> – the epsilon value to use for evaluation phases</li>
|
||||
<li><strong>continuous_exploration_policy_parameters</strong> – the parameters of the continuous exploration policy to use
|
||||
if the e-greedy is used for a continuous policy</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>action_space</strong> – the action space used by the environment</p></li>
|
||||
<li><p><strong>epsilon_schedule</strong> – a schedule for the epsilon values</p></li>
|
||||
<li><p><strong>evaluation_epsilon</strong> – the epsilon value to use for evaluation phases</p></li>
|
||||
<li><p><strong>continuous_exploration_policy_parameters</strong> – the parameters of the continuous exploration policy to use
|
||||
if the e-greedy is used for a continuous policy</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
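<p>A hedged sketch of wiring this policy into an agent's parameters, using the documented arguments (the <code>EGreedyParameters</code> class, its module path and the <code>LinearSchedule</code> helper are assumptions based on the usual Coach layout; <code>agent_params</code> is assumed to come from a preset):</p>
<pre>
# Epsilon decays linearly from 1.0 to 0.01 over 10k steps; evaluation runs greedily.
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.schedules import LinearSchedule

agent_params.exploration = EGreedyParameters()
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)
agent_params.exploration.evaluation_epsilon = 0
</pre>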
</div>
|
||||
@@ -490,14 +467,11 @@ if the e-greedy is used for a continuous policy</li>
|
||||
<dd><p>The Greedy exploration policy is intended for both discrete and continuous action spaces.
|
||||
For discrete action spaces, it always selects the action with the maximum value, as given by the agent.
|
||||
For continuous action spaces, it always returns the exact action, as it was given by the agent.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> – the action space used by the environment</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action_space</strong> – the action space used by the environment</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -509,14 +483,11 @@ For continuous action spaces, it always return the exact action, as it was given
|
||||
<dd><p>OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
|
||||
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
|
||||
the samples are correlated between consecutive time steps.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> – the action space used by the environment</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action_space</strong> – the action space used by the environment</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -531,14 +502,11 @@ The noisy layers have both weight means and weight standard deviations, and for
|
||||
the weights are sampled from a normal distribution that follows the learned weights mean and standard deviation
|
||||
values.</p>
|
||||
<p>Warning: currently supported only by DQN variants</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action_space</strong> – the action space used by the environment</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action_space</strong> – the action space used by the environment</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -555,20 +523,16 @@ wo different ways:
|
||||
be the mean of the action, and the 2nd is assumed to be its standard deviation.
|
||||
When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
|
||||
is within the bounds.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>action_space</strong> – the action space used by the environment</li>
|
||||
<li><strong>noise_percentage_schedule</strong> – the schedule for the noise variance percentage relative to the absolute range
|
||||
of the action space</li>
|
||||
<li><strong>evaluation_noise_percentage</strong> – the noise variance percentage that will be used during evaluation phases</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>action_space</strong> – the action space used by the environment</p></li>
|
||||
<li><p><strong>noise_percentage_schedule</strong> – the schedule for the noise variance percentage relative to the absolute range
|
||||
of the action space</p></li>
|
||||
<li><p><strong>evaluation_noise_percentage</strong> – the noise variance percentage that will be used during evaluation phases</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -584,23 +548,19 @@ It then updates the action value estimates to by mean(actions)+lambda*stdev(acti
|
||||
given by the user. This exploration policy aims to take advantage of the uncertainty of the agent in its predictions,
|
||||
and select the action according to the tradeoff between how uncertain the agent is, and how large it predicts
|
||||
the outcome from those actions to be.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>action_space</strong> – the action space used by the environment</li>
|
||||
<li><strong>epsilon_schedule</strong> – a schedule for the epsilon values</li>
|
||||
<li><strong>evaluation_epsilon</strong> – the epsilon value to use for evaluation phases</li>
|
||||
<li><strong>architecture_num_q_heads</strong> – the number of q heads to select from</li>
|
||||
<li><strong>lamb</strong> – lambda coefficient for taking the standard deviation into account</li>
|
||||
<li><strong>continuous_exploration_policy_parameters</strong> – the parameters of the continuous exploration policy to use
|
||||
if the e-greedy is used for a continuous policy</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>action_space</strong> – the action space used by the environment</p></li>
|
||||
<li><p><strong>epsilon_schedule</strong> – a schedule for the epsilon values</p></li>
|
||||
<li><p><strong>evaluation_epsilon</strong> – the epsilon value to use for evaluation phases</p></li>
|
||||
<li><p><strong>architecture_num_q_heads</strong> – the number of q heads to select from</p></li>
|
||||
<li><p><strong>lamb</strong> – lambda coefficient for taking the standard deviation into account</p></li>
|
||||
<li><p><strong>continuous_exploration_policy_parameters</strong> – the parameters of the continuous exploration policy to use
|
||||
if the e-greedy is used for a continuous policy</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
|
||||
@@ -617,7 +577,7 @@ if the e-greedy is used for a continuous policy</li>
|
||||
<a href="../filters/index.html" class="btn btn-neutral float-right" title="Filters" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../environments/index.html" class="btn btn-neutral" title="Environments" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../environments/index.html" class="btn btn-neutral float-left" title="Environments" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -626,7 +586,7 @@ if the e-greedy is used for a continuous policy</li>
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
</p>
|
||||
</div>
|
||||
@@ -643,27 +603,16 @@ if the e-greedy is used for a continuous policy</li>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Filters — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Filters — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Exploration Policies" href="../exploration_policies/index.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -197,13 +200,13 @@
|
||||
<p>Filters are a mechanism in Coach that allows pre-processing and post-processing of the internal agent information.
|
||||
There are two filter categories -</p>
|
||||
<ul class="simple">
|
||||
<li><strong>Input filters</strong> - these are filters that process the information passed <strong>into</strong> the agent from the environment.
|
||||
<li><p><strong>Input filters</strong> - these are filters that process the information passed <strong>into</strong> the agent from the environment.
|
||||
This information includes the observation and the reward. Input filters therefore allow rescaling observations,
|
||||
normalizing rewards, stack observations, etc.</li>
|
||||
<li><strong>Output filters</strong> - these are filters that process the information going <strong>out</strong> of the agent into the environment.
|
||||
normalizing rewards, stacking observations, etc.</p></li>
|
||||
<li><p><strong>Output filters</strong> - these are filters that process the information going <strong>out</strong> of the agent into the environment.
|
||||
This information includes the action the agent chooses to take. Output filters therefore allow conversion of
|
||||
actions from one space into another. For example, the agent can take <span class="math notranslate nohighlight">\(N\)</span> discrete actions, that will be mapped by
|
||||
the output filter onto <span class="math notranslate nohighlight">\(N\)</span> continuous actions.</li>
|
||||
the output filter onto <span class="math notranslate nohighlight">\(N\)</span> continuous actions.</p></li>
|
||||
</ul>
|
||||
<p>Filters can be stacked on top of each other in order to build complex processing flows of the inputs or outputs.</p>
|
||||
<a class="reference internal image-reference" href="../../_images/filters.png"><img alt="../../_images/filters.png" class="align-center" src="../../_images/filters.png" style="width: 350px;" /></a>
|
||||
@@ -220,7 +223,7 @@ the output filter onto <span class="math notranslate nohighlight">\(N\)</span> c
|
||||
<a href="input_filters.html" class="btn btn-neutral float-right" title="Input Filters" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../exploration_policies/index.html" class="btn btn-neutral" title="Exploration Policies" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../exploration_policies/index.html" class="btn btn-neutral float-left" title="Exploration Policies" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
@@ -229,7 +232,7 @@ the output filter onto <span class="math notranslate nohighlight">\(N\)</span> c
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
</p>
|
||||
</div>
|
||||
@@ -246,27 +249,16 @@ the output filter onto <span class="math notranslate nohighlight">\(N\)</span> c
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Input Filters — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Input Filters — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Filters" href="index.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -223,18 +226,14 @@
|
||||
For example, if the observation consists of measurements in an arbitrary range,
|
||||
and we want to control the minimum and maximum values of these observations,
|
||||
we can define a range and clip the values of the measurements.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>clipping_low</strong> – The minimum value to allow after normalizing the observation</li>
|
||||
<li><strong>clipping_high</strong> – The maximum value to allow after normalizing the observation</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>clipping_low</strong> – The minimum value to allow after normalizing the observation</p></li>
|
||||
<li><p><strong>clipping_high</strong> – The maximum value to allow after normalizing the observation</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -246,20 +245,16 @@ we can define a range and clip the values of the measurements.</p>
|
||||
<dd><p>Crops the size of the observation to a given crop window. For example, in Atari, the
|
||||
observations are images with a shape of 210x160. Usually, we will want to crop the size of the observation to a
|
||||
square of 160x160 before rescaling them.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>crop_low</strong> – a vector where each dimension describes the start index for cropping the observation in the
|
||||
corresponding dimension. a negative value of -1 will be mapped to the max size</li>
|
||||
<li><strong>crop_high</strong> – a vector where each dimension describes the end index for cropping the observation in the
|
||||
corresponding dimension. a negative value of -1 will be mapped to the max size</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>crop_low</strong> – a vector where each dimension describes the start index for cropping the observation in the
|
||||
corresponding dimension. A negative value of -1 will be mapped to the max size</p></li>
<li><p><strong>crop_high</strong> – a vector where each dimension describes the end index for cropping the observation in the
corresponding dimension. A negative value of -1 will be mapped to the max size</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
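<p>A minimal, hypothetical usage sketch of this filter: attach it to an agent-side <code>InputFilter</code>. The <code>rl_coach.filters.filter.InputFilter</code> import, the <code>add_observation_filter</code> helper, the <code>'observation'</code> key and the concrete crop indices are assumptions borrowed from typical Coach preset code.</p>
<pre>
import numpy as np
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationCropFilter

# Sketch: crop a single-channel 210x160 Atari frame to a 160x160 square.
# The indices are illustrative only.
input_filter = InputFilter()
input_filter.add_observation_filter(
    'observation', 'cropping',
    ObservationCropFilter(crop_low=np.array([34, 0]), crop_high=np.array([194, 160])))
</pre>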
|
||||
@@ -270,18 +265,14 @@ corresponding dimension. a negative value of -1 will be mapped to the max size</
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationMoveAxisFilter</code><span class="sig-paren">(</span><em>axis_origin: int = None</em>, <em>axis_target: int = None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_move_axis_filter.html#ObservationMoveAxisFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationMoveAxisFilter" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Reorders the axes of the observation. This can be useful when the observation is an
|
||||
image, and we want to move the channel axis to be the last axis instead of the first axis.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>axis_origin</strong> – The axis to move</li>
|
||||
<li><strong>axis_target</strong> – Where to move the selected axis to</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>axis_origin</strong> – The axis to move</p></li>
|
||||
<li><p><strong>axis_target</strong> – Where to move the selected axis to</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -293,18 +284,14 @@ image, and we want to move the channel axis to be the last axis instead of the f
|
||||
<dd><p>Normalizes the observation values with a running mean and standard deviation of
|
||||
all the observations seen so far. The normalization is performed element-wise. Additionally, when working with
|
||||
multiple workers, the statistics used for the normalization operation are accumulated over all the workers.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>clip_min</strong> – The minimum value to allow after normalizing the observation</li>
|
||||
<li><strong>clip_max</strong> – The maximum value to allow after normalizing the observation</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>clip_min</strong> – The minimum value to allow after normalizing the observation</p></li>
|
||||
<li><p><strong>clip_max</strong> – The maximum value to allow after normalizing the observation</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
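<p>A hypothetical sketch of plugging the normalization filter into an <code>InputFilter</code>; the helper name, observation key and clipping values are assumptions, while the <code>clip_min</code>/<code>clip_max</code> arguments follow the parameter list above.</p>
<pre>
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationNormalizationFilter

# Sketch: normalize observations with running statistics and clip the
# normalized values to the range [-5, 5].
input_filter = InputFilter()
input_filter.add_observation_filter(
    'observation', 'normalization',
    ObservationNormalizationFilter(clip_min=-5.0, clip_max=5.0))
</pre>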
|
||||
@@ -319,18 +306,14 @@ measurements, but you want the agent to only see some of the measurements and no
|
||||
For example, the CARLA environment extracts multiple measurements that can be used by the agent, such as
|
||||
speed and location. If we want to only use the speed, it can be done using this filter.
|
||||
This will currently work only for VectorObservationSpace observations.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>part_names</strong> – A list of part names to reduce</li>
|
||||
<li><strong>reduction_method</strong> – A reduction method to use - keep or discard the given parts</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>part_names</strong> – A list of part names to reduce</p></li>
|
||||
<li><p><strong>reduction_method</strong> – A reduction method to use - keep or discard the given parts</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
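<p>A hypothetical sketch for the CARLA-style case described above, keeping only the speed measurement. The <code>'measurements'</code> key and the nested <code>ReductionMethod.Keep</code> option are assumptions; only the <code>part_names</code> and <code>reduction_method</code> parameter names come from the list above.</p>
<pre>
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationReductionFilter

# Sketch: keep only the 'speed' part of a vector observation and discard the rest.
input_filter = InputFilter()
input_filter.add_observation_filter(
    'measurements', 'reduction',
    ObservationReductionFilter(
        part_names=['speed'],
        reduction_method=ObservationReductionFilter.ReductionMethod.Keep))
</pre>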
|
||||
@@ -338,22 +321,14 @@ This will currently work only for VectorObservationSpace observations</p>
|
||||
<h3>ObservationRescaleSizeByFactorFilter<a class="headerlink" href="#observationrescalesizebyfactorfilter" title="Permalink to this headline">¶</a></h3>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.filters.observation.ObservationRescaleSizeByFactorFilter">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationRescaleSizeByFactorFilter</code><span class="sig-paren">(</span><em>rescale_factor: float</em>, <em>rescaling_interpolation_type: rl_coach.filters.observation.observation_rescale_size_by_factor_filter.RescaleInterpolationType</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_rescale_size_by_factor_filter.html#ObservationRescaleSizeByFactorFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationRescaleSizeByFactorFilter" title="Permalink to this definition">¶</a></dt>
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationRescaleSizeByFactorFilter</code><span class="sig-paren">(</span><em>rescale_factor: float</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_rescale_size_by_factor_filter.html#ObservationRescaleSizeByFactorFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationRescaleSizeByFactorFilter" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Rescales an image observation by some factor. For example, the image size
|
||||
can be reduced by a factor of 2.
|
||||
Warning: this requires the input observation to be of type uint8 due to scipy requirements!</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>rescale_factor</strong> – the factor by which the observation will be rescaled</li>
|
||||
<li><strong>rescaling_interpolation_type</strong> – the interpolation type for rescaling</li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
can be reduced by a factor of 2.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>rescale_factor</strong> – the factor by which the observation will be rescaled</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -361,22 +336,15 @@ Warning: this requires the input observation to be of type uint8 due to scipy re
|
||||
<h3>ObservationRescaleToSizeFilter<a class="headerlink" href="#observationrescaletosizefilter" title="Permalink to this headline">¶</a></h3>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.filters.observation.ObservationRescaleToSizeFilter">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationRescaleToSizeFilter</code><span class="sig-paren">(</span><em>output_observation_space: rl_coach.spaces.PlanarMapsObservationSpace</em>, <em>rescaling_interpolation_type: rl_coach.filters.observation.observation_rescale_to_size_filter.RescaleInterpolationType = <RescaleInterpolationType.BILINEAR: 'bilinear'></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_rescale_to_size_filter.html#ObservationRescaleToSizeFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationRescaleToSizeFilter" title="Permalink to this definition">¶</a></dt>
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationRescaleToSizeFilter</code><span class="sig-paren">(</span><em>output_observation_space: rl_coach.spaces.PlanarMapsObservationSpace</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_rescale_to_size_filter.html#ObservationRescaleToSizeFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationRescaleToSizeFilter" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Rescales an image observation to a given size. The target size does not
|
||||
necessarily keep the aspect ratio of the original observation.
|
||||
Warning: this requires the input observation to be of type uint8 due to scipy requirements!</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>output_observation_space</strong> – the output observation space</li>
|
||||
<li><strong>rescaling_interpolation_type</strong> – the interpolation type for rescaling</li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>output_observation_space</strong> – the output observation space</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
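<p>A hypothetical sketch of rescaling image observations to 84x84, Atari-style. The <code>ImageObservationSpace</code> constructor arguments and the <code>add_observation_filter</code> helper are assumptions; the filter itself only requires a <code>PlanarMapsObservationSpace</code> describing the output, as documented above.</p>
<pre>
import numpy as np
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationRescaleToSizeFilter
from rl_coach.spaces import ImageObservationSpace

# Sketch: rescale an RGB image observation to 84x84x3 (uint8 input assumed).
input_filter = InputFilter()
input_filter.add_observation_filter(
    'observation', 'rescaling',
    ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([84, 84, 3]), high=255)))
</pre>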
|
||||
@@ -398,14 +366,11 @@ The channels axis is assumed to be the last axis</p>
|
||||
<dt id="rl_coach.filters.observation.ObservationSqueezeFilter">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.filters.observation.</code><code class="descname">ObservationSqueezeFilter</code><span class="sig-paren">(</span><em>axis: int = None</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/observation/observation_squeeze_filter.html#ObservationSqueezeFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.observation.ObservationSqueezeFilter" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Removes redundant axes from the observation, which are axes with a dimension of 1.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>axis</strong> – Specifies which axis to remove. If set to None, all the axes of size 1 will be removed.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>axis</strong> – Specifies which axis to remove. If set to None, all the axes of size 1 will be removed.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -423,18 +388,14 @@ The filter adds an additional dimension to the output observation.</p>
|
||||
<p>Warning!!! The filter replaces the observation with a LazyStack object, so no filters should be
|
||||
applied after this filter. Applying more filters will cause the LazyStack object to be converted to a numpy array
|
||||
and increase the memory footprint.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>stack_size</strong> – the number of previous observations in the stack</li>
|
||||
<li><strong>stacking_axis</strong> – the axis on which to stack the observation on</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>stack_size</strong> – the number of previous observations in the stack</p></li>
|
||||
<li><p><strong>stacking_axis</strong> – the axis on which to stack the observation on</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
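<p>A hypothetical sketch of DQN-style frame stacking; the helper name and observation key are assumptions. Per the warning above, this should be the last observation filter in the chain.</p>
<pre>
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationStackingFilter

# Sketch: keep the last 4 observations stacked along the last axis.
input_filter = InputFilter()
input_filter.add_observation_filter(
    'observation', 'stacking',
    ObservationStackingFilter(stack_size=4, stacking_axis=-1))
</pre>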
|
||||
@@ -446,18 +407,14 @@ and increase the memory footprint.</p>
|
||||
<dd><p>Converts a floating point observation into an unsigned int 8 bit observation. This is
|
||||
mostly useful for reducing memory consumption and is usually used for image observations. The filter will first
|
||||
spread the observation values over the range 0-255 and then discretize them into integer values.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>input_low</strong> – The lowest value currently present in the observation</li>
|
||||
<li><strong>input_high</strong> – The highest value currently present in the observation</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>input_low</strong> – The lowest value currently present in the observation</p></li>
|
||||
<li><p><strong>input_high</strong> – The highest value currently present in the observation</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
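<p>A hypothetical sketch of converting float image observations to uint8 to save replay memory; the helper name and observation key are assumptions, while <code>input_low</code>/<code>input_high</code> follow the parameter list above.</p>
<pre>
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationToUInt8Filter

# Sketch: values currently in [0, 255] as floats are spread and stored as uint8.
input_filter = InputFilter()
input_filter.add_observation_filter(
    'observation', 'to_uint8',
    ObservationToUInt8Filter(input_low=0, input_high=255))
</pre>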
|
||||
@@ -471,18 +428,14 @@ spread the observation values over the range 0-255 and then discretize them into
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.filters.reward.</code><code class="descname">RewardClippingFilter</code><span class="sig-paren">(</span><em>clipping_low: float = -inf</em>, <em>clipping_high: float = inf</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/reward/reward_clipping_filter.html#RewardClippingFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.reward.RewardClippingFilter" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Clips the reward values into a given range. For example, in DQN, the Atari rewards are
|
||||
clipped into the range -1 and 1 in order to control the scale of the returns.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>clipping_low</strong> – The low threshold for reward clipping</li>
|
||||
<li><strong>clipping_high</strong> – The high threshold for reward clipping</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>clipping_low</strong> – The low threshold for reward clipping</p></li>
|
||||
<li><p><strong>clipping_high</strong> – The high threshold for reward clipping</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
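<p>A hypothetical sketch of the DQN-style clipping mentioned above; the <code>add_reward_filter</code> helper is an assumption, the argument names follow the parameter list.</p>
<pre>
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward import RewardClippingFilter

# Sketch: clip every reward into the range [-1, 1].
input_filter = InputFilter()
input_filter.add_reward_filter(
    'clipping', RewardClippingFilter(clipping_low=-1.0, clipping_high=1.0))
</pre>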
|
||||
@@ -494,18 +447,14 @@ clipped into the range -1 and 1 in order to control the scale of the returns.</p
|
||||
<dd><p>Normalizes the reward values with a running mean and standard deviation of
|
||||
all the rewards seen so far. When working with multiple workers, the statistics used for the normalization operation
|
||||
are accumulated over all the workers.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>clip_min</strong> – The minimum value to allow after normalizing the reward</li>
|
||||
<li><strong>clip_max</strong> – The maximum value to allow after normalizing the reward</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>clip_min</strong> – The minimum value to allow after normalizing the reward</p></li>
|
||||
<li><p><strong>clip_max</strong> – The maximum value to allow after normalizing the reward</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -516,14 +465,11 @@ are accumulated over all the workers.</p>
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.filters.reward.</code><code class="descname">RewardRescaleFilter</code><span class="sig-paren">(</span><em>rescale_factor: float</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/filters/reward/reward_rescale_filter.html#RewardRescaleFilter"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.filters.reward.RewardRescaleFilter" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Rescales the reward by a given factor. Rescaling the rewards of the environment has been
|
||||
observed to have a large effect (negative or positive) on the behavior of the learning process.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rescale_factor</strong> – The reward rescaling factor by which the reward will be multiplied</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>rescale_factor</strong> – The reward rescaling factor by which the reward will be multiplied</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
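<p>A hypothetical sketch; the <code>add_reward_filter</code> helper and the chosen factor are assumptions. Since <code>rescale_factor</code> multiplies the reward, a factor of 1/100 scales rewards down by 100.</p>
<pre>
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward import RewardRescaleFilter

# Sketch: multiply every reward by 1/100.
input_filter = InputFilter()
input_filter.add_reward_filter('rescale', RewardRescaleFilter(rescale_factor=1 / 100.0))
</pre>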
|
||||
@@ -541,7 +487,7 @@ observed to have a large effect (negative or positive) on the behavior of the le
|
||||
<a href="output_filters.html" class="btn btn-neutral float-right" title="Output Filters" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="index.html" class="btn btn-neutral" title="Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="index.html" class="btn btn-neutral float-left" title="Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -550,7 +496,7 @@ observed to have a large effect (negative or positive) on the behavior of the le
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -567,27 +513,16 @@ observed to have a large effect (negative or positive) on the behavior of the le
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Output Filters — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Output Filters — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Input Filters" href="input_filters.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -203,18 +206,14 @@ as choosing sub-boxes in a given box. For example, consider an image of size 100
|
||||
a crop window of size 20x20 to attend to in the image. AttentionDiscretization allows discretizing the possible crop
|
||||
windows to choose into a finite number of options, and map a discrete action space into those crop windows.</p>
|
||||
<p>Warning! this will currently only work for attention spaces with 2 dimensions.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_bins_per_dimension</strong> – Number of discrete bins to use for each dimension of the action space</li>
|
||||
<li><strong>force_int_bins</strong> – If set to True, all the bins will represent integer coordinates in space.</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_bins_per_dimension</strong> – Number of discrete bins to use for each dimension of the action space</p></li>
|
||||
<li><p><strong>force_int_bins</strong> – If set to True, all the bins will represent integer coordinates in space.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<img alt="../../_images/attention_discretization.png" class="align-center" src="../../_images/attention_discretization.png" />
|
||||
@@ -227,21 +226,17 @@ original continuous action space is uniformly separated into the given number of
|
||||
action index. Each discrete action is mapped to a single N dimensional action in the BoxActionSpace action space.
|
||||
For example, if the original actions space is between -1 and 1 and 5 bins were selected, the new action
|
||||
space will consist of 5 actions mapped to -1, -0.5, 0, 0.5 and 1.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>num_bins_per_dimension</strong> – The number of bins to use for each dimension of the target action space.
|
||||
The bins will be spread out uniformly over this space</li>
|
||||
<li><strong>force_int_bins</strong> – force the bins to represent only integer actions. for example, if the action space is in
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>num_bins_per_dimension</strong> – The number of bins to use for each dimension of the target action space.
|
||||
The bins will be spread out uniformly over this space</p></li>
|
||||
<li><p><strong>force_int_bins</strong> – force the bins to represent only integer actions. for example, if the action space is in
|
||||
the range 0-10 and there are 5 bins, then the bins will be placed at 0, 2, 5, 7, 10,
|
||||
instead of 0, 2.5, 5, 7.5, 10.</li>
|
||||
instead of 0, 2.5, 5, 7.5, 10.</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
<img alt="../../_images/box_discretization.png" class="align-center" src="../../_images/box_discretization.png" />
|
||||
@@ -252,18 +247,14 @@ instead of 0, 2.5, 5, 7.5, 10.</li>
|
||||
if the original action space is between -1 and 1, then this filter can be used in order to constrain the agent actions
|
||||
to the range 0 and 1 instead. This essentially masks the range -1 and 0 from the agent.
|
||||
The resulting action space will be shifted and will always start from 0 and have the size of the unmasked area.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>masked_target_space_low</strong> – the lowest values that can be chosen in the target action space</li>
|
||||
<li><strong>masked_target_space_high</strong> – the highest values that can be chosen in the target action space</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>masked_target_space_low</strong> – the lowest values that can be chosen in the target action space</p></li>
|
||||
<li><p><strong>masked_target_space_high</strong> – the highest values that can be chosen in the target action space</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<img alt="../../_images/box_masking.png" class="align-center" src="../../_images/box_masking.png" />
|
||||
@@ -275,18 +266,14 @@ with a MultiSelect action space (select multiple actions at the same time, such
|
||||
MultiSelect actions. If we want the agent to be able to select only 5 of those actions by their index (0-4), we can
|
||||
map a discrete action space with 5 actions into the 5 selected MultiSelect actions. This will both allow the agent to
|
||||
use regular discrete actions, and mask 3 of the actions from the agent.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>target_actions</strong> – A partial list of actions from the target space to map to.</li>
|
||||
<li><strong>descriptions</strong> – a list of descriptions of each of the actions</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>target_actions</strong> – A partial list of actions from the target space to map to.</p></li>
|
||||
<li><p><strong>descriptions</strong> – a list of descriptions of each of the actions</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<img alt="../../_images/partial_discrete_action_space_map.png" class="align-center" src="../../_images/partial_discrete_action_space_map.png" />
|
||||
@@ -309,18 +296,14 @@ environment consists of continuous actions between 0 and 1, and we want the agen
|
||||
the LinearBoxToBoxMap can be used to map the range -1 and 1 to the range 0 and 1 in a linear way. This means that the
|
||||
action -1 will be mapped to 0, the action 1 will be mapped to 1, and the rest of the actions will be linearly mapped
|
||||
between those values.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>input_space_low</strong> – the low values of the desired action space</li>
|
||||
<li><strong>input_space_high</strong> – the high values of the desired action space</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>input_space_low</strong> – the low values of the desired action space</p></li>
|
||||
<li><p><strong>input_space_high</strong> – the high values of the desired action space</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
<img alt="../../_images/linear_box_to_box_map.png" class="align-center" src="../../_images/linear_box_to_box_map.png" />
|
||||
@@ -338,7 +321,7 @@ between those values.</p>
|
||||
<a href="../memories/index.html" class="btn btn-neutral float-right" title="Memories" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="input_filters.html" class="btn btn-neutral" title="Input Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="input_filters.html" class="btn btn-neutral float-left" title="Input Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -347,7 +330,7 @@ between those values.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -364,27 +347,16 @@ between those values.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Memories — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Memories — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Output Filters" href="../filters/output_filters.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
@@ -210,14 +213,11 @@
|
||||
<dd><p>A replay buffer that stores episodes of transitions. The additional structure allows performing various
|
||||
calculations of total return and other values that depend on the sequential behavior of the transitions
|
||||
in the episode.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>max_size</strong> – the maximum number of transitions or episodes to hold in the memory</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>max_size</strong> – the maximum number of transitions or episodes to hold in the memory</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -227,22 +227,18 @@ in the episode.</p>
|
||||
<dt id="rl_coach.memories.episodic.EpisodicHindsightExperienceReplay">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.memories.episodic.</code><code class="descname">EpisodicHindsightExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], hindsight_transitions_per_regular_transition: int, hindsight_goal_selection_method: rl_coach.memories.episodic.episodic_hindsight_experience_replay.HindsightGoalSelectionMethod, goals_space: rl_coach.spaces.GoalsSpace</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/episodic/episodic_hindsight_experience_replay.html#EpisodicHindsightExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.episodic.EpisodicHindsightExperienceReplay" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Implements Hindsight Experience Replay as described in the following paper: <a class="reference external" href="https://arxiv.org/pdf/1707.01495.pdf">https://arxiv.org/pdf/1707.01495.pdf</a></p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>max_size</strong> – The maximum size of the memory. should be defined in a granularity of Transitions</li>
|
||||
<li><strong>hindsight_transitions_per_regular_transition</strong> – The number of hindsight artificial transitions to generate
|
||||
for each actual transition</li>
|
||||
<li><strong>hindsight_goal_selection_method</strong> – The method that will be used for generating the goals for the
|
||||
hindsight transitions. Should be one of HindsightGoalSelectionMethod</li>
|
||||
<li><strong>goals_space</strong> – A GoalsSpace which defines the base properties of the goals space</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>max_size</strong> – The maximum size of the memory. Should be defined in a granularity of Transitions</p></li>
|
||||
<li><p><strong>hindsight_transitions_per_regular_transition</strong> – The number of hindsight artificial transitions to generate
|
||||
for each actual transition</p></li>
|
||||
<li><p><strong>hindsight_goal_selection_method</strong> – The method that will be used for generating the goals for the
|
||||
hindsight transitions. Should be one of HindsightGoalSelectionMethod</p></li>
|
||||
<li><p><strong>goals_space</strong> – A GoalsSpace which defines the base properties of the goals space</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
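<p>A hypothetical construction sketch, wrapped in a helper so that the required <code>GoalsSpace</code> is supplied by the caller. The value <code>HindsightGoalSelectionMethod.Future</code> and the chosen numbers are assumptions; the argument names follow the parameter list above.</p>
<pre>
from rl_coach.memories.episodic import EpisodicHindsightExperienceReplay
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod
from rl_coach.memories.memory import MemoryGranularity

def make_her_memory(goals_space):
    # goals_space: a pre-built rl_coach.spaces.GoalsSpace describing the goals.
    return EpisodicHindsightExperienceReplay(
        max_size=(MemoryGranularity.Transitions, 1000000),
        hindsight_transitions_per_regular_transition=4,
        hindsight_goal_selection_method=HindsightGoalSelectionMethod.Future,
        goals_space=goals_space)
</pre>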
|
||||
@@ -253,23 +249,19 @@ hindsight transitions. Should be one of HindsightGoalSelectionMethod</li>
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.memories.episodic.</code><code class="descname">EpisodicHRLHindsightExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], hindsight_transitions_per_regular_transition: int, hindsight_goal_selection_method: rl_coach.memories.episodic.episodic_hindsight_experience_replay.HindsightGoalSelectionMethod, goals_space: rl_coach.spaces.GoalsSpace</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/episodic/episodic_hrl_hindsight_experience_replay.html#EpisodicHRLHindsightExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.episodic.EpisodicHRLHindsightExperienceReplay" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Implements HRL Hindsight Experience Replay as described in the following paper: <a class="reference external" href="https://arxiv.org/abs/1805.08180">https://arxiv.org/abs/1805.08180</a></p>
|
||||
<p>This is the memory you should use if you want a shared hindsight experience replay buffer between multiple workers</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>max_size</strong> – The maximum size of the memory. should be defined in a granularity of Transitions</li>
|
||||
<li><strong>hindsight_transitions_per_regular_transition</strong> – The number of hindsight artificial transitions to generate
|
||||
for each actual transition</li>
|
||||
<li><strong>hindsight_goal_selection_method</strong> – The method that will be used for generating the goals for the
|
||||
hindsight transitions. Should be one of HindsightGoalSelectionMethod</li>
|
||||
<li><strong>goals_space</strong> – A GoalsSpace which defines the properties of the goals</li>
|
||||
<li><strong>do_action_hindsight</strong> – Replace the action (sub-goal) given to a lower layer, with the actual achieved goal</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>max_size</strong> – The maximum size of the memory. Should be defined in a granularity of Transitions</p></li>
|
||||
<li><p><strong>hindsight_transitions_per_regular_transition</strong> – The number of hindsight artificial transitions to generate
|
||||
for each actual transition</p></li>
|
||||
<li><p><strong>hindsight_goal_selection_method</strong> – The method that will be used for generating the goals for the
|
||||
hindsight transitions. Should be one of HindsightGoalSelectionMethod</p></li>
|
||||
<li><p><strong>goals_space</strong> – A GoalsSpace which defines the properties of the goals</p></li>
|
||||
<li><p><strong>do_action_hindsight</strong> – Replace the action (sub-goal) given to a lower layer, with the actual achieved goal</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -289,21 +281,17 @@ hindsight transitions. Should be one of HindsightGoalSelectionMethod</li>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.memories.non_episodic.BalancedExperienceReplay">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.memories.non_episodic.</code><code class="descname">BalancedExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], allow_duplicates_in_batch_sampling: bool = True, num_classes: int = 0, state_key_with_the_class_index: Any = 'class'</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/non_episodic/balanced_experience_replay.html#BalancedExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.non_episodic.BalancedExperienceReplay" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>max_size</strong> – the maximum number of transitions or episodes to hold in the memory</li>
|
||||
<li><strong>allow_duplicates_in_batch_sampling</strong> – allow having the same transition multiple times in a batch</li>
|
||||
<li><strong>num_classes</strong> – the number of classes in the replayed data</li>
|
||||
<li><strong>state_key_with_the_class_index</strong> – the class index is assumed to be a value in the state dictionary.
|
||||
this parameter determines the key to retrieve the class index value</li>
|
||||
<dd><dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>max_size</strong> – the maximum number of transitions or episodes to hold in the memory</p></li>
|
||||
<li><p><strong>allow_duplicates_in_batch_sampling</strong> – allow having the same transition multiple times in a batch</p></li>
|
||||
<li><p><strong>num_classes</strong> – the number of classes in the replayed data</p></li>
|
||||
<li><p><strong>state_key_with_the_class_index</strong> – the class index is assumed to be a value in the state dictionary.
|
||||
this parameter determines the key to retrieve the class index value</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -321,18 +309,14 @@ this parameter determines the key to retrieve the class index value</li>
|
||||
<dt id="rl_coach.memories.non_episodic.ExperienceReplay">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.memories.non_episodic.</code><code class="descname">ExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], allow_duplicates_in_batch_sampling: bool = True</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/non_episodic/experience_replay.html#ExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.non_episodic.ExperienceReplay" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>A regular replay buffer which stores transition without any additional structure</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>max_size</strong> – the maximum number of transitions or episodes to hold in the memory</li>
|
||||
<li><strong>allow_duplicates_in_batch_sampling</strong> – allow having the same transition multiple times in a batch</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>max_size</strong> – the maximum number of transitions or episodes to hold in the memory</p></li>
|
||||
<li><p><strong>allow_duplicates_in_batch_sampling</strong> – allow having the same transition multiple times in a batch</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
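<p>A hypothetical construction sketch; the size is arbitrary. In presets the corresponding parameters class is typically assigned to <code>agent_params.memory</code> rather than building the memory directly, which is shown here only to illustrate the constructor arguments.</p>
<pre>
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.memories.non_episodic import ExperienceReplay

# Sketch: a plain replay buffer holding up to one million transitions.
memory = ExperienceReplay(max_size=(MemoryGranularity.Transitions, 1000000))
</pre>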
|
||||
@@ -343,21 +327,17 @@ this parameter determines the key to retrieve the class index value</li>
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.memories.non_episodic.</code><code class="descname">PrioritizedExperienceReplay</code><span class="sig-paren">(</span><em>max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], alpha: float = 0.6, beta: rl_coach.schedules.Schedule = <rl_coach.schedules.ConstantSchedule object>, epsilon: float = 1e-06, allow_duplicates_in_batch_sampling: bool = True</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/non_episodic/prioritized_experience_replay.html#PrioritizedExperienceReplay"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.non_episodic.PrioritizedExperienceReplay" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>This is the proportional sampling variant of the prioritized experience replay as described
|
||||
in <a class="reference external" href="https://arxiv.org/pdf/1511.05952.pdf">https://arxiv.org/pdf/1511.05952.pdf</a>.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>max_size</strong> – the maximum number of transitions or episodes to hold in the memory</li>
|
||||
<li><strong>alpha</strong> – the alpha prioritization coefficient</li>
|
||||
<li><strong>beta</strong> – the beta parameter used for importance sampling</li>
|
||||
<li><strong>epsilon</strong> – a small value added to the priority of each transition</li>
|
||||
<li><strong>allow_duplicates_in_batch_sampling</strong> – allow having the same transition multiple times in a batch</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>max_size</strong> – the maximum number of transitions or episodes to hold in the memory</p></li>
|
||||
<li><p><strong>alpha</strong> – the alpha prioritization coefficient</p></li>
|
||||
<li><p><strong>beta</strong> – the beta parameter used for importance sampling</p></li>
|
||||
<li><p><strong>epsilon</strong> – a small value added to the priority of each transition</p></li>
|
||||
<li><p><strong>allow_duplicates_in_batch_sampling</strong> – allow having the same transition multiple times in a batch</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
</div>
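<p>A hypothetical construction sketch with the commonly used <code>alpha=0.6</code>; <code>beta</code> and <code>epsilon</code> keep the defaults shown in the signature above, and the size is arbitrary.</p>
<pre>
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.memories.non_episodic import PrioritizedExperienceReplay

# Sketch: proportional prioritized replay over one million transitions.
memory = PrioritizedExperienceReplay(
    max_size=(MemoryGranularity.Transitions, 1000000),
    alpha=0.6)
</pre>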
|
||||
@@ -385,7 +365,7 @@ are constructed on top of.</p>
|
||||
<a href="../memory_backends/index.html" class="btn btn-neutral float-right" title="Memory Backends" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../filters/output_filters.html" class="btn btn-neutral" title="Output Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../filters/output_filters.html" class="btn btn-neutral float-left" title="Output Filters" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -394,7 +374,7 @@ are constructed on top of.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -411,27 +391,16 @@ are constructed on top of.</p>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
</body>
|
||||
</html>
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Memory Backends — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Memory Backends — Reinforcement Learning Coach 0.12.1 documentation</title>
@@ -17,13 +17,21 @@
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Memories" href="../memories/index.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -193,14 +196,11 @@
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.memories.backend.redis.</code><code class="descname">RedisPubSubBackend</code><span class="sig-paren">(</span><em>params: rl_coach.memories.backend.redis.RedisPubSubMemoryBackendParameters</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/memories/backend/redis.html#RedisPubSubBackend"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.memories.backend.redis.RedisPubSubBackend" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>A memory backend which transfers the experiences from the rollout worker to the training worker using Redis Pub/Sub in
|
||||
Coach when distributed mode is used.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>params</strong> – The Redis parameters to be used with this Redis Pub/Sub instance.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>params</strong> – The Redis parameters to be used with this Redis Pub/Sub instance.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</div>
|
||||
@@ -217,7 +217,7 @@ Coach when distributed mode is used.</p>
|
||||
<a href="../orchestrators/index.html" class="btn btn-neutral float-right" title="Orchestrators" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../memories/index.html" class="btn btn-neutral" title="Memories" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../memories/index.html" class="btn btn-neutral float-left" title="Memories" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -226,7 +226,7 @@ Coach when distributed mode is used.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -243,27 +243,16 @@ Coach when distributed mode is used.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
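For readers scanning this diff for the documented API itself: the page above describes RedisPubSubBackend, which is constructed from a RedisPubSubMemoryBackendParameters instance. The fields of the parameters class are not listed on this page, so the sketch below treats it as default-constructible (pointing at a local Redis instance); that part is an assumption, not the library's confirmed defaults.

```python
# Minimal sketch of wiring the documented Redis Pub/Sub memory backend.
from rl_coach.memories.backend.redis import (
    RedisPubSubBackend,
    RedisPubSubMemoryBackendParameters,
)

# Assumption: default construction targets a locally running Redis instance.
params = RedisPubSubMemoryBackendParameters()
backend = RedisPubSubBackend(params=params)  # rollout workers publish experiences, the trainer subscribes
```

In a distributed Coach run this backend carries experiences from the rollout workers to the training worker over a Redis Pub/Sub channel, as the class description above states.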
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Orchestrators — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Orchestrators — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Memory Backends" href="../memory_backends/index.html" />
|
||||
<link href="../../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -193,14 +196,11 @@
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.orchestrators.kubernetes_orchestrator.</code><code class="descname">Kubernetes</code><span class="sig-paren">(</span><em>params: rl_coach.orchestrators.kubernetes_orchestrator.KubernetesParameters</em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/rl_coach/orchestrators/kubernetes_orchestrator.html#Kubernetes"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.orchestrators.kubernetes_orchestrator.Kubernetes" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>An orchestrator implementation which uses Kubernetes to deploy the components such as training and rollout workers
|
||||
and Redis Pub/Sub in Coach when used in the distributed mode.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>params</strong> – The Kubernetes parameters which are used for deploying the components in Coach. These parameters</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>params</strong> – The Kubernetes parameters which are used for deploying the components in Coach. These parameters</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<p>include namespace and kubeconfig.</p>
|
||||
</dd></dl>
|
||||
|
||||
@@ -218,7 +218,7 @@ and Redis Pub/Sub in Coach when used in the distributed mode.</p>
|
||||
<a href="../core_types.html" class="btn btn-neutral float-right" title="Core Types" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="../memory_backends/index.html" class="btn btn-neutral" title="Memory Backends" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="../memory_backends/index.html" class="btn btn-neutral float-left" title="Memory Backends" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -227,7 +227,7 @@ and Redis Pub/Sub in Coach when used in the distributed mode.</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -244,27 +244,16 @@ and Redis Pub/Sub in Coach when used in the distributed mode.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
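A hedged sketch of the Kubernetes orchestrator documented above. The page only says that KubernetesParameters "include namespace and kubeconfig" without listing its constructor, so building it with defaults below is an assumption rather than the library's exact API.

```python
from rl_coach.orchestrators.kubernetes_orchestrator import Kubernetes, KubernetesParameters

# Assumption: KubernetesParameters is default-constructible; in a real run the
# namespace and kubeconfig it carries would be set to point at the target cluster.
params = KubernetesParameters()
orchestrator = Kubernetes(params=params)  # deploys the training worker, rollout workers and Redis Pub/Sub
```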
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Spaces — Reinforcement Learning Coach 0.11.0 documentation</title>
|
||||
<title>Spaces — Reinforcement Learning Coach 0.12.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@@ -17,13 +17,21 @@
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../_static/js/modernizr.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
<script type="text/javascript" src="../_static/js/theme.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
|
||||
<link rel="stylesheet" href="../_static/css/custom.css" type="text/css" />
|
||||
@@ -33,21 +41,16 @@
|
||||
<link rel="prev" title="Core Types" href="core_types.html" />
|
||||
<link href="../_static/css/custom.css" rel="stylesheet" type="text/css">
|
||||
|
||||
|
||||
|
||||
<script src="../_static/js/modernizr.min.js"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
|
||||
|
||||
<div class="wy-grid-for-nav">
|
||||
|
||||
|
||||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||||
<div class="wy-side-scroll">
|
||||
<div class="wy-side-nav-search">
|
||||
<div class="wy-side-nav-search" >
|
||||
|
||||
|
||||
|
||||
@@ -207,52 +210,44 @@
|
||||
<dt id="rl_coach.spaces.Space">
|
||||
<em class="property">class </em><code class="descclassname">rl_coach.spaces.</code><code class="descname">Space</code><span class="sig-paren">(</span><em>shape: Union[int, tuple, list, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/spaces.html#Space"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.Space" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>A space defines a set of valid values</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>shape</strong> – the shape of the space</li>
|
||||
<li><strong>low</strong> – the lowest values possible in the space. can be an array defining the lowest values per point,
|
||||
or a single value defining the general lowest values</li>
|
||||
<li><strong>high</strong> – the highest values possible in the space. can be an array defining the highest values per point,
|
||||
or a single value defining the general highest values</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>shape</strong> – the shape of the space</p></li>
|
||||
<li><p><strong>low</strong> – the lowest values possible in the space. can be an array defining the lowest values per point,
|
||||
or a single value defining the general lowest values</p></li>
|
||||
<li><p><strong>high</strong> – the highest values possible in the space. can be an array defining the highest values per point,
|
||||
or a single value defining the general highest values</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.Space.contains">
|
||||
<code class="descname">contains</code><span class="sig-paren">(</span><em>val: Union[int, float, numpy.ndarray]</em><span class="sig-paren">)</span> → bool<a class="reference internal" href="../_modules/rl_coach/spaces.html#Space.contains"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.Space.contains" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Checks if value is contained by this space. The shape must match and
|
||||
all of the values must be within the low and high bounds.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> – a value to check</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True / False depending on if the val matches the space definition</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>val</strong> – a value to check</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True / False depending on if the val matches the space definition</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.Space.is_valid_index">
|
||||
<code class="descname">is_valid_index</code><span class="sig-paren">(</span><em>index: numpy.ndarray</em><span class="sig-paren">)</span> → bool<a class="reference internal" href="../_modules/rl_coach/spaces.html#Space.is_valid_index"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.Space.is_valid_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Checks if a given multidimensional index is within the bounds of the shape of the space</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>index</strong> – a multidimensional index</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True if the index is within the shape of the space. False otherwise</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>index</strong> – a multidimensional index</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True if the index is within the shape of the space. False otherwise</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -260,14 +255,11 @@ all of the values must be within the low and high bounds.</p>
|
||||
<code class="descname">sample</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → numpy.ndarray<a class="reference internal" href="../_modules/rl_coach/spaces.html#Space.sample"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.Space.sample" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sample the defined space, either uniformly, if space bounds are defined, or normally distributed if no
|
||||
bounds are defined</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A numpy array sampled from the space</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>A numpy array sampled from the space</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
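To make the Space API in the hunk above concrete, here is a minimal sketch that uses only the constructor signature and methods shown on this page (shape/low/high, sample, contains); it assumes the base class behaves as documented.

```python
import numpy as np
from rl_coach.spaces import Space

# A 3-dimensional space bounded in [-1, 1] on every axis; shape may also be a tuple.
space = Space(shape=3, low=-1.0, high=1.0)

sample = space.sample()                      # uniform sample, since both bounds are finite
assert space.contains(sample)                # shape matches and all values are within the bounds
assert not space.contains(np.full(3, 5.0))   # values outside [-1, 1] -> False
```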
|
||||
@@ -283,32 +275,28 @@ bounds are defined</p>
|
||||
<code class="descname">contains</code><span class="sig-paren">(</span><em>val: Union[int, float, numpy.ndarray]</em><span class="sig-paren">)</span> → bool<a class="headerlink" href="#rl_coach.spaces.ObservationSpace.contains" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Checks if value is contained by this space. The shape must match and
|
||||
all of the values must be within the low and high bounds.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> – a value to check</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True / False depending on if the val matches the space definition</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>val</strong> – a value to check</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True / False depending on if the val matches the space definition</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.ObservationSpace.is_valid_index">
|
||||
<code class="descname">is_valid_index</code><span class="sig-paren">(</span><em>index: numpy.ndarray</em><span class="sig-paren">)</span> → bool<a class="headerlink" href="#rl_coach.spaces.ObservationSpace.is_valid_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Checks if a given multidimensional index is within the bounds of the shape of the space</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>index</strong> – a multidimensional index</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True if the index is within the shape of the space. False otherwise</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>index</strong> – a multidimensional index</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True if the index is within the shape of the space. False otherwise</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -316,14 +304,11 @@ all of the values must be within the low and high bounds.</p>
|
||||
<code class="descname">sample</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → numpy.ndarray<a class="headerlink" href="#rl_coach.spaces.ObservationSpace.sample" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sample the defined space, either uniformly, if space bounds are defined, or normally distributed if no
|
||||
bounds are defined</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A numpy array sampled from the space</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>A numpy array sampled from the space</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
@@ -368,16 +353,14 @@ represent a RGB image, or a grayscale image.</p>
|
||||
<dt id="rl_coach.spaces.ActionSpace.clip_action_to_space">
|
||||
<code class="descname">clip_action_to_space</code><span class="sig-paren">(</span><em>action: Union[int, float, numpy.ndarray, List]</em><span class="sig-paren">)</span> → Union[int, float, numpy.ndarray, List]<a class="reference internal" href="../_modules/rl_coach/spaces.html#ActionSpace.clip_action_to_space"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.ActionSpace.clip_action_to_space" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given an action, clip its values to fit the action space ranges</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> – a given action</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the clipped action</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action</strong> – a given action</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the clipped action</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -385,32 +368,28 @@ represent a RGB image, or a grayscale image.</p>
|
||||
<code class="descname">contains</code><span class="sig-paren">(</span><em>val: Union[int, float, numpy.ndarray]</em><span class="sig-paren">)</span> → bool<a class="headerlink" href="#rl_coach.spaces.ActionSpace.contains" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Checks if value is contained by this space. The shape must match and
|
||||
all of the values must be within the low and high bounds.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> – a value to check</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True / False depending on if the val matches the space definition</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>val</strong> – a value to check</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True / False depending on if the val matches the space definition</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.ActionSpace.is_valid_index">
|
||||
<code class="descname">is_valid_index</code><span class="sig-paren">(</span><em>index: numpy.ndarray</em><span class="sig-paren">)</span> → bool<a class="headerlink" href="#rl_coach.spaces.ActionSpace.is_valid_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Checks if a given multidimensional index is within the bounds of the shape of the space</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>index</strong> – a multidimensional index</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True if the index is within the shape of the space. False otherwise</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>index</strong> – a multidimensional index</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True if the index is within the shape of the space. False otherwise</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -418,28 +397,22 @@ all of the values must be within the low and high bounds.</p>
|
||||
<code class="descname">sample</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → numpy.ndarray<a class="headerlink" href="#rl_coach.spaces.ActionSpace.sample" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sample the defined space, either uniformly, if space bounds are defined, or normally distributed if no
|
||||
bounds are defined</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A numpy array sampled from the space</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>A numpy array sampled from the space</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.ActionSpace.sample_with_info">
|
||||
<code class="descname">sample_with_info</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → rl_coach.core_types.ActionInfo<a class="reference internal" href="../_modules/rl_coach/spaces.html#ActionSpace.sample_with_info"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.ActionSpace.sample_with_info" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get a random action with additional “fake” info</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">An action info instance</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>An action info instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
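A short usage sketch for the ActionSpace methods documented above (clip_action_to_space, sample_with_info, contains). BoxActionSpace is a concrete subclass that is not documented on this page, so its constructor arguments here are an assumption.

```python
import numpy as np
from rl_coach.spaces import BoxActionSpace  # concrete continuous action space; assumed signature

# A 2-dimensional continuous action space bounded in [-1, 1].
action_space = BoxActionSpace(2, low=-1.0, high=1.0)

clipped = action_space.clip_action_to_space(np.array([3.0, -0.5]))  # -> array([ 1. , -0.5])
info = action_space.sample_with_info()   # ActionInfo wrapping a random action plus "fake" info
assert action_space.contains(clipped)
```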
|
||||
@@ -505,21 +478,17 @@ by itself an action space. In Starcraft, the arguments are Discrete action space
|
||||
agents can use it as an output action space.
|
||||
The class acts as a wrapper to the target space. So after setting the target space, all the values of the class
|
||||
will match the values of the target space (the shape, low, high, etc.)</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
|
||||
<li><strong>goal_name</strong> – the name of the observation space to use as the achieved goal.</li>
|
||||
<li><strong>reward_type</strong> – the reward type to use for converting distances from goal to rewards</li>
|
||||
<li><strong>distance_metric</strong> – the distance metric to use. could be either one of the distances in the
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>goal_name</strong> – the name of the observation space to use as the achieved goal.</p></li>
|
||||
<li><p><strong>reward_type</strong> – the reward type to use for converting distances from goal to rewards</p></li>
|
||||
<li><p><strong>distance_metric</strong> – the distance metric to use. could be either one of the distances in the
|
||||
DistanceMetric enum, or a custom function that gets two vectors as input and
|
||||
returns the distance between them</li>
|
||||
returns the distance between them</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="class">
|
||||
<dt id="rl_coach.spaces.GoalsSpace.DistanceMetric">
|
||||
<em class="property">class </em><code class="descname">DistanceMetric</code><a class="reference internal" href="../_modules/rl_coach/spaces.html#GoalsSpace.DistanceMetric"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.GoalsSpace.DistanceMetric" title="Permalink to this definition">¶</a></dt>
|
||||
@@ -530,16 +499,14 @@ returns the distance between them</li>
|
||||
<dt id="rl_coach.spaces.GoalsSpace.clip_action_to_space">
|
||||
<code class="descname">clip_action_to_space</code><span class="sig-paren">(</span><em>action: Union[int, float, numpy.ndarray, List]</em><span class="sig-paren">)</span> → Union[int, float, numpy.ndarray, List]<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.clip_action_to_space" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given an action, clip its values to fit the action space ranges</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>action</strong> – a given action</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the clipped action</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>action</strong> – a given action</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the clipped action</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -547,90 +514,76 @@ returns the distance between them</li>
|
||||
<code class="descname">contains</code><span class="sig-paren">(</span><em>val: Union[int, float, numpy.ndarray]</em><span class="sig-paren">)</span> → bool<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.contains" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Checks if value is contained by this space. The shape must match and
|
||||
all of the values must be within the low and high bounds.</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>val</strong> – a value to check</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True / False depending on if the val matches the space definition</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>val</strong> – a value to check</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True / False depending on if the val matches the space definition</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.GoalsSpace.distance_from_goal">
|
||||
<code class="descname">distance_from_goal</code><span class="sig-paren">(</span><em>goal: numpy.ndarray</em>, <em>state: dict</em><span class="sig-paren">)</span> → float<a class="reference internal" href="../_modules/rl_coach/spaces.html#GoalsSpace.distance_from_goal"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.GoalsSpace.distance_from_goal" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given a state, check its distance from the goal</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>goal</strong> – a numpy array representing the goal</li>
|
||||
<li><strong>state</strong> – a dict representing the state</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>goal</strong> – a numpy array representing the goal</p></li>
|
||||
<li><p><strong>state</strong> – a dict representing the state</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the distance from the goal</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the distance from the goal</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.GoalsSpace.get_reward_for_goal_and_state">
|
||||
<code class="descname">get_reward_for_goal_and_state</code><span class="sig-paren">(</span><em>goal: numpy.ndarray</em>, <em>state: dict</em><span class="sig-paren">)</span> → Tuple[float, bool]<a class="reference internal" href="../_modules/rl_coach/spaces.html#GoalsSpace.get_reward_for_goal_and_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.GoalsSpace.get_reward_for_goal_and_state" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given a state, check if the goal was reached and return a reward accordingly</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
|
||||
<li><strong>goal</strong> – a numpy array representing the goal</li>
|
||||
<li><strong>state</strong> – a dict representing the state</li>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>goal</strong> – a numpy array representing the goal</p></li>
|
||||
<li><p><strong>state</strong> – a dict representing the state</p></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">the reward for the current goal and state pair and a boolean representing if the goal was reached</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the reward for the current goal and state pair and a boolean representing if the goal was reached</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.GoalsSpace.goal_from_state">
|
||||
<code class="descname">goal_from_state</code><span class="sig-paren">(</span><em>state: Dict</em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/rl_coach/spaces.html#GoalsSpace.goal_from_state"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#rl_coach.spaces.GoalsSpace.goal_from_state" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Given a state, extract an observation according to the goal_name</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>state</strong> – a dictionary of observations</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">the observation corresponding to the goal_name</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>state</strong> – a dictionary of observations</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the observation corresponding to the goal_name</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.GoalsSpace.is_valid_index">
|
||||
<code class="descname">is_valid_index</code><span class="sig-paren">(</span><em>index: numpy.ndarray</em><span class="sig-paren">)</span> → bool<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.is_valid_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Checks if a given multidimensional index is within the bounds of the shape of the space</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>index</strong> – a multidimensional index</td>
|
||||
</tr>
|
||||
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">True if the index is within the shape of the space. False otherwise</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>index</strong> – a multidimensional index</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True if the index is within the shape of the space. False otherwise</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
@@ -638,28 +591,22 @@ all of the values must be within the low and high bounds.</p>
|
||||
<code class="descname">sample</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → numpy.ndarray<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.sample" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Sample the defined space, either uniformly, if space bounds are defined, or normally distributed if no
|
||||
bounds are defined</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">A numpy array sampled from the space</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>A numpy array sampled from the space</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="method">
|
||||
<dt id="rl_coach.spaces.GoalsSpace.sample_with_info">
|
||||
<code class="descname">sample_with_info</code><span class="sig-paren">(</span><span class="sig-paren">)</span> → rl_coach.core_types.ActionInfo<a class="headerlink" href="#rl_coach.spaces.GoalsSpace.sample_with_info" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Get a random action with additional “fake” info</p>
|
||||
<table class="docutils field-list" frame="void" rules="none">
|
||||
<col class="field-name" />
|
||||
<col class="field-body" />
|
||||
<tbody valign="top">
|
||||
<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">An action info instance</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>An action info instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
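Finally, a sketch of the GoalsSpace wiring described above (goal_name, reward_type, distance_metric, and the distance/reward helpers). ReachingGoal and DistanceMetric.Euclidean are not documented on this page, so both names are assumptions: a reward scheme that pays a fixed reward once the distance to the goal drops below a threshold, and a Euclidean member of the DistanceMetric enum.

```python
import numpy as np
from rl_coach.spaces import GoalsSpace, ReachingGoal  # ReachingGoal: assumed reward-type class

goals_space = GoalsSpace(
    goal_name='achieved_goal',                                    # observation used as the achieved goal
    reward_type=ReachingGoal(distance_from_goal_threshold=0.05,   # assumed constructor arguments
                             goal_reaching_reward=0,
                             default_reward=-1),
    distance_metric=GoalsSpace.DistanceMetric.Euclidean)          # assumed enum member

state = {'achieved_goal': np.array([0.0, 0.0, 0.0])}
goal = np.array([0.1, 0.0, 0.0])

goals_space.goal_from_state(state)                       # -> the observation named 'achieved_goal'
goals_space.distance_from_goal(goal, state)              # -> ~0.1 with a Euclidean metric
reward, reached = goals_space.get_reward_for_goal_and_state(goal, state)
```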
|
||||
@@ -678,7 +625,7 @@ bounds are defined</p>
|
||||
<a href="additional_parameters.html" class="btn btn-neutral float-right" title="Additional Parameters" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
|
||||
|
||||
|
||||
<a href="core_types.html" class="btn btn-neutral" title="Core Types" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
<a href="core_types.html" class="btn btn-neutral float-left" title="Core Types" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -687,7 +634,7 @@ bounds are defined</p>
|
||||
|
||||
<div role="contentinfo">
|
||||
<p>
|
||||
© Copyright 2018, Intel AI Lab
|
||||
© Copyright 2018-2019, Intel AI Lab
|
||||
|
||||
</p>
|
||||
</div>
|
||||
@@ -704,27 +651,16 @@ bounds are defined</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
|
||||
<script type="text/javascript" src="../_static/jquery.js"></script>
|
||||
<script type="text/javascript" src="../_static/underscore.js"></script>
|
||||
<script type="text/javascript" src="../_static/doctools.js"></script>
|
||||
<script type="text/javascript" src="../_static/language_data.js"></script>
|
||||
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript" src="../_static/js/theme.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
jQuery(function () {
|
||||
SphinxRtdTheme.Navigation.enable(true);
|
||||
});
|
||||
</script>
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||