1
0
mirror of https://github.com/gryf/coach.git synced 2025-12-18 03:30:19 +01:00
This commit is contained in:
Gal Leibovich
2019-06-16 11:11:21 +03:00
committed by GitHub
parent 8df3c46756
commit 7eb884c5b2
107 changed files with 2200 additions and 495 deletions

View File

@@ -202,24 +202,27 @@
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">from</span> <span class="nn">rl_coach.core_types</span> <span class="k">import</span> <span class="n">RunPhase</span><span class="p">,</span> <span class="n">ActionType</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ContinuousActionExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.schedules</span> <span class="k">import</span> <span class="n">Schedule</span><span class="p">,</span> <span class="n">LinearSchedule</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">BoxActionSpace</span>
<span class="c1"># TODO: consider renaming to gaussian sampling</span>
<span class="k">class</span> <span class="nc">AdditiveNoiseParameters</span><span class="p">(</span><span class="n">ExplorationParameters</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span> <span class="o">=</span> <span class="n">LinearSchedule</span><span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mi">50000</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise_percentage</span> <span class="o">=</span> <span class="mf">0.05</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span> <span class="o">=</span> <span class="n">LinearSchedule</span><span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mi">50000</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise</span> <span class="o">=</span> <span class="mf">0.05</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_as_percentage_from_action_space</span> <span class="o">=</span> <span class="kc">True</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">path</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="s1">&#39;rl_coach.exploration_policies.additive_noise:AdditiveNoise&#39;</span>
<div class="viewcode-block" id="AdditiveNoise"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.additive_noise.AdditiveNoise">[docs]</a><span class="k">class</span> <span class="nc">AdditiveNoise</span><span class="p">(</span><span class="n">ExplorationPolicy</span><span class="p">):</span>
<div class="viewcode-block" id="AdditiveNoise"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.additive_noise.AdditiveNoise">[docs]</a><span class="k">class</span> <span class="nc">AdditiveNoise</span><span class="p">(</span><span class="n">ContinuousActionExplorationPolicy</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent</span>
<span class="sd"> and adds Gaussian-distributed noise to it. The amount of noise added to the action follows the noise amount that</span>
@@ -228,17 +231,19 @@
<span class="sd"> 2. Specified by the agent&#39;s action. In case the agent&#39;s action is a list with 2 values, the 1st one is assumed to</span>
<span class="sd"> be the mean of the action, and the 2nd is assumed to be its standard deviation.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_space</span><span class="p">:</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">noise_percentage_schedule</span><span class="p">:</span> <span class="n">Schedule</span><span class="p">,</span>
<span class="n">evaluation_noise_percentage</span><span class="p">:</span> <span class="nb">float</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_space</span><span class="p">:</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">noise_schedule</span><span class="p">:</span> <span class="n">Schedule</span><span class="p">,</span>
<span class="n">evaluation_noise</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">noise_as_percentage_from_action_space</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :param action_space: the action space used by the environment</span>
<span class="sd"> :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range</span>
<span class="sd"> of the action space</span>
<span class="sd"> :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases</span>
<span class="sd"> :param noise_schedule: the schedule for the noise</span>
<span class="sd"> :param evaluation_noise: the noise standard deviation that will be used during evaluation phases</span>
<span class="sd"> :param noise_as_percentage_from_action_space: a bool deciding whether the noise is given as a percentage of the</span>
<span class="sd"> action space range (True) or as an absolute value (False)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">action_space</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span> <span class="o">=</span> <span class="n">noise_percentage_schedule</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise_percentage</span> <span class="o">=</span> <span class="n">evaluation_noise_percentage</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span> <span class="o">=</span> <span class="n">noise_schedule</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise</span> <span class="o">=</span> <span class="n">evaluation_noise</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_as_percentage_from_action_space</span> <span class="o">=</span> <span class="n">noise_as_percentage_from_action_space</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_space</span><span class="p">,</span> <span class="n">BoxActionSpace</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Additive noise exploration works only for continuous controls.&quot;</span>
@@ -248,19 +253,20 @@
<span class="ow">or</span> <span class="ow">not</span> <span class="n">np</span><span class="o">.</span><span class="n">all</span><span class="p">(</span><span class="o">-</span><span class="n">np</span><span class="o">.</span><span class="n">inf</span> <span class="o">&lt;</span> <span class="n">action_space</span><span class="o">.</span><span class="n">low</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">np</span><span class="o">.</span><span class="n">all</span><span class="p">(</span><span class="n">action_space</span><span class="o">.</span><span class="n">low</span> <span class="o">&lt;</span> <span class="n">np</span><span class="o">.</span><span class="n">inf</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Additive noise exploration requires bounded actions&quot;</span><span class="p">)</span>
<span class="c1"># TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">ActionType</span><span class="p">:</span>
<span class="c1"># TODO-potential-bug consider separating internally defined stdev and externally defined stdev into 2 policies</span>
<span class="c1"># set the current noise percentage</span>
<span class="c1"># set the current noise</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TEST</span><span class="p">:</span>
<span class="n">current_noise_precentage</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise_percentage</span>
<span class="n">current_noise</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">current_noise_precentage</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span><span class="o">.</span><span class="n">current_value</span>
<span class="n">current_noise</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span><span class="o">.</span><span class="n">current_value</span>
<span class="c1"># scale the noise to the action space range</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">current_noise_precentage</span> <span class="o">*</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">high</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">low</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">noise_as_percentage_from_action_space</span><span class="p">:</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">current_noise</span> <span class="o">*</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">high</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">low</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">current_noise</span>
<span class="c1"># extract the mean values</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_values</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
@@ -272,18 +278,21 @@
<span class="c1"># step the noise schedule</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="ow">is</span> <span class="ow">not</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TEST</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span><span class="o">.</span><span class="n">step</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span><span class="o">.</span><span class="n">step</span><span class="p">()</span>
<span class="c1"># the second element of the list is assumed to be the standard deviation</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_values</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">action_values</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">squeeze</span><span class="p">()</span>
<span class="c1"># add noise to the action means</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">action_values_mean</span><span class="p">,</span> <span class="n">action_values_std</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="ow">is</span> <span class="ow">not</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TEST</span><span class="p">:</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">action_values_mean</span><span class="p">,</span> <span class="n">action_values_std</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">action_values_mean</span>
<span class="k">return</span> <span class="n">action</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">atleast_1d</span><span class="p">(</span><span class="n">action</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_control_param</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span><span class="o">.</span><span class="n">current_value</span></div>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span><span class="o">.</span><span class="n">current_value</span></div>
</pre></div>
</div>

View File

@@ -202,7 +202,7 @@
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">from</span> <span class="nn">rl_coach.core_types</span> <span class="k">import</span> <span class="n">RunPhase</span><span class="p">,</span> <span class="n">ActionType</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">DiscreteActionExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.schedules</span> <span class="k">import</span> <span class="n">Schedule</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span>
@@ -217,8 +217,7 @@
<span class="k">return</span> <span class="s1">&#39;rl_coach.exploration_policies.boltzmann:Boltzmann&#39;</span>
<div class="viewcode-block" id="Boltzmann"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.boltzmann.Boltzmann">[docs]</a><span class="k">class</span> <span class="nc">Boltzmann</span><span class="p">(</span><span class="n">ExplorationPolicy</span><span class="p">):</span>
<div class="viewcode-block" id="Boltzmann"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.boltzmann.Boltzmann">[docs]</a><span class="k">class</span> <span class="nc">Boltzmann</span><span class="p">(</span><span class="n">DiscreteActionExplorationPolicy</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible</span>
<span class="sd"> actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values</span>
@@ -233,7 +232,7 @@
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">action_space</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">temperature_schedule</span> <span class="o">=</span> <span class="n">temperature_schedule</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">ActionType</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="p">(</span><span class="n">ActionType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TRAIN</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">temperature_schedule</span><span class="o">.</span><span class="n">step</span><span class="p">()</span>
<span class="c1"># softmax calculation</span>
@@ -242,7 +241,8 @@
<span class="c1"># make sure probs sum to 1</span>
<span class="n">probabilities</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">-</span> <span class="n">np</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">probabilities</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>
<span class="c1"># choose actions according to the probabilities</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">choice</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">shape</span><span class="p">),</span> <span class="n">p</span><span class="o">=</span><span class="n">probabilities</span><span class="p">)</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">choice</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">shape</span><span class="p">),</span> <span class="n">p</span><span class="o">=</span><span class="n">probabilities</span><span class="p">)</span>
<span class="k">return</span> <span class="n">action</span><span class="p">,</span> <span class="n">probabilities</span>
<span class="k">def</span> <span class="nf">get_control_param</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">temperature_schedule</span><span class="o">.</span><span class="n">current_value</span></div>

View File

@@ -202,7 +202,7 @@
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">from</span> <span class="nn">rl_coach.core_types</span> <span class="k">import</span> <span class="n">RunPhase</span><span class="p">,</span> <span class="n">ActionType</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">DiscreteActionExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span>
@@ -212,7 +212,7 @@
<span class="k">return</span> <span class="s1">&#39;rl_coach.exploration_policies.categorical:Categorical&#39;</span>
<div class="viewcode-block" id="Categorical"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.categorical.Categorical">[docs]</a><span class="k">class</span> <span class="nc">Categorical</span><span class="p">(</span><span class="n">ExplorationPolicy</span><span class="p">):</span>
<div class="viewcode-block" id="Categorical"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.categorical.Categorical">[docs]</a><span class="k">class</span> <span class="nc">Categorical</span><span class="p">(</span><span class="n">DiscreteActionExplorationPolicy</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Categorical exploration policy is intended for discrete action spaces. It expects the action values to</span>
<span class="sd"> represent a probability distribution over the actions, from which a single action will be sampled.</span>
@@ -225,13 +225,18 @@
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">action_space</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">ActionType</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="p">(</span><span class="n">ActionType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TRAIN</span><span class="p">:</span>
<span class="c1"># choose actions according to the probabilities</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">choice</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">actions</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="n">action_values</span><span class="p">)</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">choice</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">actions</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="n">action_values</span><span class="p">)</span>
<span class="k">return</span> <span class="n">action</span><span class="p">,</span> <span class="n">action_values</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># take the action with the highest probability</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="n">one_hot_action_probabilities</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">actions</span><span class="p">))</span>
<span class="n">one_hot_action_probabilities</span><span class="p">[</span><span class="n">action</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
<span class="k">return</span> <span class="n">action</span><span class="p">,</span> <span class="n">one_hot_action_probabilities</span>
<span class="k">def</span> <span class="nf">get_control_param</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="mi">0</span></div>

View File

@@ -203,8 +203,7 @@
<span class="kn">from</span> <span class="nn">rl_coach.core_types</span> <span class="k">import</span> <span class="n">RunPhase</span><span class="p">,</span> <span class="n">ActionType</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.additive_noise</span> <span class="k">import</span> <span class="n">AdditiveNoiseParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationPolicy</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationParameters</span><span class="p">,</span> <span class="n">ExplorationPolicy</span>
<span class="kn">from</span> <span class="nn">rl_coach.schedules</span> <span class="k">import</span> <span class="n">Schedule</span><span class="p">,</span> <span class="n">LinearSchedule</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">DiscreteActionSpace</span><span class="p">,</span> <span class="n">BoxActionSpace</span>
<span class="kn">from</span> <span class="nn">rl_coach.utils</span> <span class="k">import</span> <span class="n">dynamic_import_and_instantiate_module_from_params</span>
@@ -216,7 +215,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon_schedule</span> <span class="o">=</span> <span class="n">LinearSchedule</span><span class="p">(</span><span class="mf">0.5</span><span class="p">,</span> <span class="mf">0.01</span><span class="p">,</span> <span class="mi">50000</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_epsilon</span> <span class="o">=</span> <span class="mf">0.05</span>
<span class="bp">self</span><span class="o">.</span><span class="n">continuous_exploration_policy_parameters</span> <span class="o">=</span> <span class="n">AdditiveNoiseParameters</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">continuous_exploration_policy_parameters</span><span class="o">.</span><span class="n">noise_percentage_schedule</span> <span class="o">=</span> <span class="n">LinearSchedule</span><span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mi">50000</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">continuous_exploration_policy_parameters</span><span class="o">.</span><span class="n">noise_schedule</span> <span class="o">=</span> <span class="n">LinearSchedule</span><span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mi">50000</span><span class="p">)</span>
<span class="c1"># for continuous control -</span>
<span class="c1"># (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)</span>
@@ -265,26 +264,32 @@
<span class="n">epsilon</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">evaluation_epsilon</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TEST</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon_schedule</span><span class="o">.</span><span class="n">current_value</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">current_random_value</span> <span class="o">&gt;=</span> <span class="n">epsilon</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">ActionType</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="p">(</span><span class="n">ActionType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]):</span>
<span class="n">epsilon</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">evaluation_epsilon</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TEST</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon_schedule</span><span class="o">.</span><span class="n">current_value</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="p">,</span> <span class="n">DiscreteActionSpace</span><span class="p">):</span>
<span class="n">top_action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">current_random_value</span> <span class="o">&lt;</span> <span class="n">epsilon</span><span class="p">:</span>
<span class="n">chosen_action</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
<span class="n">probabilities</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">full</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">actions</span><span class="p">),</span>
<span class="mf">1.</span> <span class="o">/</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">high</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">low</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="mi">1</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">chosen_action</span> <span class="o">=</span> <span class="n">top_action</span>
<span class="n">chosen_action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="c1"># one-hot probabilities vector</span>
<span class="n">probabilities</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">actions</span><span class="p">))</span>
<span class="n">probabilities</span><span class="p">[</span><span class="n">chosen_action</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
<span class="bp">self</span><span class="o">.</span><span class="n">step_epsilon</span><span class="p">()</span>
<span class="k">return</span> <span class="n">chosen_action</span><span class="p">,</span> <span class="n">probabilities</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">current_random_value</span> <span class="o">&lt;</span> <span class="n">epsilon</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TRAIN</span><span class="p">:</span>
<span class="n">chosen_action</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">chosen_action</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">continuous_exploration_policy</span><span class="o">.</span><span class="n">get_action</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="c1"># step the epsilon schedule and generate a new random value for next time</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TRAIN</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon_schedule</span><span class="o">.</span><span class="n">step</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">current_random_value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">()</span>
<span class="k">return</span> <span class="n">chosen_action</span>
<span class="bp">self</span><span class="o">.</span><span class="n">step_epsilon</span><span class="p">()</span>
<span class="k">return</span> <span class="n">chosen_action</span>
<span class="k">def</span> <span class="nf">get_control_param</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="p">,</span> <span class="n">DiscreteActionSpace</span><span class="p">):</span>
@@ -295,7 +300,13 @@
<span class="k">def</span> <span class="nf">change_phase</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">phase</span><span class="p">):</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">change_phase</span><span class="p">(</span><span class="n">phase</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="p">,</span> <span class="n">BoxActionSpace</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">continuous_exploration_policy</span><span class="o">.</span><span class="n">change_phase</span><span class="p">(</span><span class="n">phase</span><span class="p">)</span></div>
<span class="bp">self</span><span class="o">.</span><span class="n">continuous_exploration_policy</span><span class="o">.</span><span class="n">change_phase</span><span class="p">(</span><span class="n">phase</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">step_epsilon</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="c1"># step the epsilon schedule and generate a new random value for next time</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TRAIN</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon_schedule</span><span class="o">.</span><span class="n">step</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">current_random_value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">()</span></div>
</pre></div>
</div>

View File

@@ -201,7 +201,7 @@
<span class="kn">from</span> <span class="nn">rl_coach.base_parameters</span> <span class="k">import</span> <span class="n">Parameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.core_types</span> <span class="k">import</span> <span class="n">RunPhase</span><span class="p">,</span> <span class="n">ActionType</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">DiscreteActionSpace</span><span class="p">,</span> <span class="n">BoxActionSpace</span><span class="p">,</span> <span class="n">GoalsSpace</span>
<span class="k">class</span> <span class="nc">ExplorationParameters</span><span class="p">(</span><span class="n">Parameters</span><span class="p">):</span>
@@ -237,14 +237,10 @@
<span class="sd"> Given a list of values corresponding to each action, </span>
<span class="sd"> choose one action according to the exploration policy</span>
<span class="sd"> :param action_values: A list of action values</span>
<span class="sd"> :return: The chosen action</span>
<span class="sd"> :return: The chosen action,</span>
<span class="sd"> The probability of the action (if available, otherwise 1 for absolute certainty in the action)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span> <span class="o">==</span> <span class="n">ExplorationPolicy</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;The ExplorationPolicy class is an abstract class and should not be used directly. &quot;</span>
<span class="s2">&quot;Please set the exploration parameters to point to an inheriting class like EGreedy or &quot;</span>
<span class="s2">&quot;AdditiveNoise&quot;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;The get_action function should be overridden in the inheriting exploration class&quot;</span><span class="p">)</span></div>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">()</span></div>
<div class="viewcode-block" id="ExplorationPolicy.change_phase"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.exploration_policy.ExplorationPolicy.change_phase">[docs]</a> <span class="k">def</span> <span class="nf">change_phase</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">phase</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
@@ -265,6 +261,45 @@
<span class="k">def</span> <span class="nf">get_control_param</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="mi">0</span></div>
<span class="k">class</span> <span class="nc">DiscreteActionExplorationPolicy</span><span class="p">(</span><span class="n">ExplorationPolicy</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A discrete action exploration policy.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_space</span><span class="p">:</span> <span class="n">ActionSpace</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :param action_space: the action space used by the environment</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_space</span><span class="p">,</span> <span class="n">DiscreteActionSpace</span><span class="p">)</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">action_space</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="p">(</span><span class="n">ActionType</span><span class="p">,</span> <span class="n">List</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Given a list of values corresponding to each action,</span>
<span class="sd"> choose one action according to the exploration policy</span>
<span class="sd"> :param action_values: A list of action values</span>
<span class="sd"> :return: The chosen action,</span>
<span class="sd"> The probabilities of actions to select from (if not available a one-hot vector)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span> <span class="o">==</span> <span class="n">ExplorationPolicy</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;The ExplorationPolicy class is an abstract class and should not be used directly. &quot;</span>
<span class="s2">&quot;Please set the exploration parameters to point to an inheriting class like EGreedy or &quot;</span>
<span class="s2">&quot;AdditiveNoise&quot;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;The get_action function should be overridden in the inheriting exploration class&quot;</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">ContinuousActionExplorationPolicy</span><span class="p">(</span><span class="n">ExplorationPolicy</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A continuous action exploration policy.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_space</span><span class="p">:</span> <span class="n">ActionSpace</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :param action_space: the action space used by the environment</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_space</span><span class="p">,</span> <span class="n">BoxActionSpace</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_space</span><span class="p">,</span> <span class="n">GoalsSpace</span><span class="p">)</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">action_space</span><span class="p">)</span>
</pre></div>
</div>

View File

@@ -202,7 +202,7 @@
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">from</span> <span class="nn">rl_coach.core_types</span> <span class="k">import</span> <span class="n">ActionType</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationParameters</span><span class="p">,</span> <span class="n">ExplorationPolicy</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">DiscreteActionSpace</span><span class="p">,</span> <span class="n">BoxActionSpace</span>
@@ -224,9 +224,12 @@
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">action_space</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">ActionType</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">]):</span>
<span class="k">if</span> <span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="p">)</span> <span class="o">==</span> <span class="n">DiscreteActionSpace</span><span class="p">:</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="n">one_hot_action_probabilities</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">actions</span><span class="p">))</span>
<span class="n">one_hot_action_probabilities</span><span class="p">[</span><span class="n">action</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
<span class="k">return</span> <span class="n">action</span><span class="p">,</span> <span class="n">one_hot_action_probabilities</span>
<span class="k">if</span> <span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="p">)</span> <span class="o">==</span> <span class="n">BoxActionSpace</span><span class="p">:</span>
<span class="k">return</span> <span class="n">action_values</span>

View File

@@ -202,12 +202,13 @@
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">from</span> <span class="nn">rl_coach.core_types</span> <span class="k">import</span> <span class="n">RunPhase</span><span class="p">,</span> <span class="n">ActionType</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ContinuousActionExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">BoxActionSpace</span><span class="p">,</span> <span class="n">GoalsSpace</span>
<span class="c1"># Based on the description in:</span>
<span class="c1"># https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab</span>
<span class="k">class</span> <span class="nc">OUProcessParameters</span><span class="p">(</span><span class="n">ExplorationParameters</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
@@ -222,7 +223,7 @@
<span class="c1"># Ornstein-Uhlenbeck process</span>
<div class="viewcode-block" id="OUProcess"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.ou_process.OUProcess">[docs]</a><span class="k">class</span> <span class="nc">OUProcess</span><span class="p">(</span><span class="n">ExplorationPolicy</span><span class="p">):</span>
<div class="viewcode-block" id="OUProcess"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.ou_process.OUProcess">[docs]</a><span class="k">class</span> <span class="nc">OUProcess</span><span class="p">(</span><span class="n">ContinuousActionExplorationPolicy</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> OUProcess exploration policy is intended for continuous action spaces, and selects the action according to</span>
<span class="sd"> an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where</span>
@@ -239,10 +240,6 @@
<span class="bp">self</span><span class="o">.</span><span class="n">state</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">dt</span> <span class="o">=</span> <span class="n">dt</span>
<span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">action_space</span><span class="p">,</span> <span class="n">BoxActionSpace</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_space</span><span class="p">,</span> <span class="n">GoalsSpace</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;OU process exploration works only for continuous controls.&quot;</span>
<span class="s2">&quot;The given action space is of type: </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">action_space</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">reset</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">state</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span>

View File

@@ -242,9 +242,13 @@
<span class="bp">self</span><span class="o">.</span><span class="n">network_params</span> <span class="o">=</span> <span class="n">network_params</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_replace_network_dense_layers</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">ActionType</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">]):</span>
<span class="k">if</span> <span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="p">)</span> <span class="o">==</span> <span class="n">DiscreteActionSpace</span><span class="p">:</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span>
<span class="n">one_hot_action_probabilities</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">actions</span><span class="p">))</span>
<span class="n">one_hot_action_probabilities</span><span class="p">[</span><span class="n">action</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
<span class="k">return</span> <span class="n">action</span><span class="p">,</span> <span class="n">one_hot_action_probabilities</span>
<span class="k">elif</span> <span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="p">)</span> <span class="o">==</span> <span class="n">BoxActionSpace</span><span class="p">:</span>
<span class="n">action_values_mean</span> <span class="o">=</span> <span class="n">action_values</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">squeeze</span><span class="p">()</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">action_values</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">squeeze</span><span class="p">()</span>

View File

@@ -203,7 +203,7 @@
<span class="kn">from</span> <span class="nn">scipy.stats</span> <span class="k">import</span> <span class="n">truncnorm</span>
<span class="kn">from</span> <span class="nn">rl_coach.core_types</span> <span class="k">import</span> <span class="n">RunPhase</span><span class="p">,</span> <span class="n">ActionType</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationPolicy</span><span class="p">,</span> <span class="n">ExplorationParameters</span>
<span class="kn">from</span> <span class="nn">rl_coach.exploration_policies.exploration_policy</span> <span class="k">import</span> <span class="n">ExplorationParameters</span><span class="p">,</span> <span class="n">ContinuousActionExplorationPolicy</span>
<span class="kn">from</span> <span class="nn">rl_coach.schedules</span> <span class="k">import</span> <span class="n">Schedule</span><span class="p">,</span> <span class="n">LinearSchedule</span>
<span class="kn">from</span> <span class="nn">rl_coach.spaces</span> <span class="k">import</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">BoxActionSpace</span>
@@ -211,17 +211,18 @@
<span class="k">class</span> <span class="nc">TruncatedNormalParameters</span><span class="p">(</span><span class="n">ExplorationParameters</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span> <span class="o">=</span> <span class="n">LinearSchedule</span><span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mi">50000</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise_percentage</span> <span class="o">=</span> <span class="mf">0.05</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span> <span class="o">=</span> <span class="n">LinearSchedule</span><span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mi">50000</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise</span> <span class="o">=</span> <span class="mf">0.05</span>
<span class="bp">self</span><span class="o">.</span><span class="n">clip_low</span> <span class="o">=</span> <span class="mi">0</span>
<span class="bp">self</span><span class="o">.</span><span class="n">clip_high</span> <span class="o">=</span> <span class="mi">1</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_as_percentage_from_action_space</span> <span class="o">=</span> <span class="kc">True</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">path</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="s1">&#39;rl_coach.exploration_policies.truncated_normal:TruncatedNormal&#39;</span>
<div class="viewcode-block" id="TruncatedNormal"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.truncated_normal.TruncatedNormal">[docs]</a><span class="k">class</span> <span class="nc">TruncatedNormal</span><span class="p">(</span><span class="n">ExplorationPolicy</span><span class="p">):</span>
<div class="viewcode-block" id="TruncatedNormal"><a class="viewcode-back" href="../../../components/exploration_policies/index.html#rl_coach.exploration_policies.truncated_normal.TruncatedNormal">[docs]</a><span class="k">class</span> <span class="nc">TruncatedNormal</span><span class="p">(</span><span class="n">ContinuousActionExplorationPolicy</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a</span>
<span class="sd"> normal distribution, where the mean action is given by the agent, and the standard deviation can be given in t</span>
@@ -232,17 +233,20 @@
<span class="sd"> When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it</span>
<span class="sd"> is within the bounds.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_space</span><span class="p">:</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">noise_percentage_schedule</span><span class="p">:</span> <span class="n">Schedule</span><span class="p">,</span>
<span class="n">evaluation_noise_percentage</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">clip_low</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">clip_high</span><span class="p">:</span> <span class="nb">float</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_space</span><span class="p">:</span> <span class="n">ActionSpace</span><span class="p">,</span> <span class="n">noise_schedule</span><span class="p">:</span> <span class="n">Schedule</span><span class="p">,</span>
<span class="n">evaluation_noise</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">clip_low</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">clip_high</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">noise_as_percentage_from_action_space</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :param action_space: the action space used by the environment</span>
<span class="sd"> :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range</span>
<span class="sd"> of the action space</span>
<span class="sd"> :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases</span>
<span class="sd"> :param noise_schedule: the schedule for the noise variance</span>
<span class="sd"> :param evaluation_noise: the noise variance that will be used during evaluation phases</span>
<span class="sd"> :param noise_as_percentage_from_action_space: whether to consider the noise as a percentage of the action space</span>
<span class="sd"> or absolute value</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">action_space</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span> <span class="o">=</span> <span class="n">noise_percentage_schedule</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise_percentage</span> <span class="o">=</span> <span class="n">evaluation_noise_percentage</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span> <span class="o">=</span> <span class="n">noise_schedule</span>
<span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise</span> <span class="o">=</span> <span class="n">evaluation_noise</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_as_percentage_from_action_space</span> <span class="o">=</span> <span class="n">noise_as_percentage_from_action_space</span>
<span class="bp">self</span><span class="o">.</span><span class="n">clip_low</span> <span class="o">=</span> <span class="n">clip_low</span>
<span class="bp">self</span><span class="o">.</span><span class="n">clip_high</span> <span class="o">=</span> <span class="n">clip_high</span>
@@ -254,17 +258,21 @@
<span class="ow">or</span> <span class="ow">not</span> <span class="n">np</span><span class="o">.</span><span class="n">all</span><span class="p">(</span><span class="o">-</span><span class="n">np</span><span class="o">.</span><span class="n">inf</span> <span class="o">&lt;</span> <span class="n">action_space</span><span class="o">.</span><span class="n">low</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">np</span><span class="o">.</span><span class="n">all</span><span class="p">(</span><span class="n">action_space</span><span class="o">.</span><span class="n">low</span> <span class="o">&lt;</span> <span class="n">np</span><span class="o">.</span><span class="n">inf</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Additive noise exploration requires bounded actions&quot;</span><span class="p">)</span>
<span class="c1"># TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage</span>
<span class="k">def</span> <span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ActionType</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">ActionType</span><span class="p">:</span>
<span class="c1"># set the current noise percentage</span>
<span class="c1"># set the current noise</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="o">==</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TEST</span><span class="p">:</span>
<span class="n">current_noise_precentage</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise_percentage</span>
<span class="n">current_noise</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">evaluation_noise</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">current_noise_precentage</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span><span class="o">.</span><span class="n">current_value</span>
<span class="n">current_noise</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span><span class="o">.</span><span class="n">current_value</span>
<span class="c1"># scale the noise to the action space range</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">current_noise_precentage</span> <span class="o">*</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">high</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">low</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">noise_as_percentage_from_action_space</span><span class="p">:</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">current_noise</span> <span class="o">*</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">high</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">low</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">current_noise</span>
<span class="c1"># scale the noise to the action space range</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">current_noise</span> <span class="o">*</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">high</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">low</span><span class="p">)</span>
<span class="c1"># extract the mean values</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_values</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
@@ -276,7 +284,7 @@
<span class="c1"># step the noise schedule</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">phase</span> <span class="ow">is</span> <span class="ow">not</span> <span class="n">RunPhase</span><span class="o">.</span><span class="n">TEST</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span><span class="o">.</span><span class="n">step</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span><span class="o">.</span><span class="n">step</span><span class="p">()</span>
<span class="c1"># the second element of the list is assumed to be the standard deviation</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">action_values</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">action_values</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">action_values_std</span> <span class="o">=</span> <span class="n">action_values</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">squeeze</span><span class="p">()</span>
@@ -290,7 +298,7 @@
<span class="k">return</span> <span class="n">action</span>
<span class="k">def</span> <span class="nf">get_control_param</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">noise_percentage_schedule</span><span class="o">.</span><span class="n">current_value</span></div>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">noise_schedule</span><span class="o">.</span><span class="n">current_value</span></div>
</pre></div>
</div>