mirror of
https://github.com/gryf/coach.git
synced 2026-03-27 22:03:33 +01:00
pre-release 0.10.0
This commit is contained in:
42
rl_coach/exploration_policies/README.md
Normal file
42
rl_coach/exploration_policies/README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Exploration Policy
|
||||
|
||||
An exploration policy is a module that is responsible for choosing the action according to the action values, the
|
||||
current phase, its internal state and the specific exploration policy algorithm.
|
||||
|
||||
A custom exploration policy should implement both the exploration policy class and the exploration policy parameters
|
||||
class, which defines the parameters and the location of the exploration policy module.
|
||||
The parameters of the exploration policy class should match the parameters in the exploration policy parameters class.
|
||||
|
||||
Exploration policies typically have some control parameter that defines their current exploration state, and
|
||||
a schedule for this parameter. This schedule can be defined using the Schedule class which is defined in
|
||||
exploration_policy.py.
|
||||
|
||||
A custom implementation should look as follows:
|
||||
|
||||
```
|
||||
class CustomExplorationParameters(ExplorationParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
...
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'module_path:class_name'
|
||||
|
||||
|
||||
class CustomExplorationPolicy(ExplorationPolicy):
|
||||
def __init__(self, action_space: ActionSpace, ...):
|
||||
super().__init__(action_space)
|
||||
|
||||
def reset(self):
|
||||
...
|
||||
|
||||
def get_action(self, action_values: List[ActionType]) -> ActionType:
|
||||
...
|
||||
|
||||
def change_phase(self, phase):
|
||||
...
|
||||
|
||||
def get_control_param(self):
|
||||
...
|
||||
```
|
||||
15
rl_coach/exploration_policies/__init__.py
Normal file
15
rl_coach/exploration_policies/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
95
rl_coach/exploration_policies/additive_noise.py
Normal file
95
rl_coach/exploration_policies/additive_noise.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.schedules import Schedule, LinearSchedule
|
||||
from rl_coach.spaces import ActionSpace, BoxActionSpace
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
|
||||
|
||||
# TODO: consider renaming to gaussian sampling
|
||||
class AdditiveNoiseParameters(ExplorationParameters):
    """Parameters for the AdditiveNoise exploration policy."""

    def __init__(self):
        super().__init__()
        # schedule for the noise variance percentage, relative to the action space range
        self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        # fixed noise variance percentage applied during evaluation phases
        self.evaluation_noise_percentage = 0.05

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.additive_noise:AdditiveNoise'
|
||||
|
||||
|
||||
class AdditiveNoise(ExplorationPolicy):
    """
    Selects actions by sampling from a normal distribution centered on the given
    action means. The standard deviation is either a scheduled percentage of the
    action space range, or - when a second element is supplied in the action
    values list - an externally provided stdev.
    """

    def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
                 evaluation_noise_percentage: float):
        """
        :param action_space: the action space used by the environment
        :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range
                                          of the action space
        :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases
        :raises ValueError: if the action space is not a bounded BoxActionSpace
        """
        super().__init__(action_space)
        self.noise_percentage_schedule = noise_percentage_schedule
        self.evaluation_noise_percentage = evaluation_noise_percentage

        if not isinstance(action_space, BoxActionSpace):
            raise ValueError("Additive noise exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))

        # the noise is defined as a percentage of the action range, so both bounds
        # must be finite (np.isfinite is equivalent to the -inf < x < inf check,
        # and also rejects NaN bounds)
        if not (np.all(np.isfinite(action_space.high)) and np.all(np.isfinite(action_space.low))):
            raise ValueError("Additive noise exploration requires bounded actions")

        # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # TODO-potential-bug consider separating internally defined stdev and externally defined stdev into 2 policies

        # set the current noise percentage: fixed during evaluation, scheduled otherwise
        if self.phase == RunPhase.TEST:
            current_noise_percentage = self.evaluation_noise_percentage
        else:
            current_noise_percentage = self.noise_percentage_schedule.current_value

        # scale the noise stdev to the action space range
        action_values_std = current_noise_percentage * (self.action_space.high - self.action_space.low)

        # extract the mean values (and an optional externally defined stdev)
        if isinstance(action_values, list):
            # the action values are expected to be a list with the action mean and optionally the action stdev
            action_values_mean = action_values[0].squeeze()
            if len(action_values) > 1:
                # the second element overrides the internally computed stdev
                action_values_std = action_values[1].squeeze()
        else:
            # the action values are expected to be a numpy array representing the action mean
            action_values_mean = action_values.squeeze()

        # step the noise schedule only while training (current_value was already read above)
        if self.phase == RunPhase.TRAIN:
            self.noise_percentage_schedule.step()

        # add noise to the action means
        return np.random.normal(action_values_mean, action_values_std)

    def get_control_param(self):
        # per-dimension noise percentage currently in effect
        return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value
|
||||
59
rl_coach/exploration_policies/boltzmann.py
Normal file
59
rl_coach/exploration_policies/boltzmann.py
Normal file
@@ -0,0 +1,59 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.schedules import Schedule
|
||||
from rl_coach.spaces import ActionSpace
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
|
||||
|
||||
class BoltzmannParameters(ExplorationParameters):
    """Parameters for the Boltzmann (softmax) exploration policy."""

    def __init__(self):
        super().__init__()
        # schedule for the softmax temperature; must be assigned before the policy is used
        self.temperature_schedule = None

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.boltzmann:Boltzmann'
|
||||
|
||||
|
||||
|
||||
class Boltzmann(ExplorationPolicy):
    """
    Samples a discrete action from the softmax of the action values, using a
    scheduled temperature parameter.
    """

    def __init__(self, action_space: ActionSpace, temperature_schedule: Schedule):
        """
        :param action_space: the action space used by the environment
        :param temperature_schedule: the schedule for the temperature parameter of the softmax
        """
        super().__init__(action_space)
        self.temperature_schedule = temperature_schedule

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        if self.phase == RunPhase.TRAIN:
            self.temperature_schedule.step()
        # softmax calculation, shifted by the max action value for numerical
        # stability: subtracting a constant before the exponent leaves the
        # normalized probabilities unchanged but prevents np.exp overflow for
        # large action values / small temperatures
        shifted = action_values - np.max(action_values)
        exp_probabilities = np.exp(shifted / self.temperature_schedule.current_value)
        probabilities = exp_probabilities / np.sum(exp_probabilities)
        # compensate for floating point rounding so the probabilities sum to exactly 1
        probabilities[-1] = 1 - np.sum(probabilities[:-1])
        # choose an action index according to the probabilities
        return np.random.choice(range(self.action_space.shape), p=probabilities)

    def get_control_param(self):
        # the current softmax temperature
        return self.temperature_schedule.current_value
|
||||
77
rl_coach/exploration_policies/bootstrapped.py
Normal file
77
rl_coach/exploration_policies/bootstrapped.py
Normal file
@@ -0,0 +1,77 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedy, EGreedyParameters
|
||||
from rl_coach.schedules import Schedule, LinearSchedule
|
||||
from rl_coach.spaces import ActionSpace
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
|
||||
|
||||
|
||||
class BootstrappedParameters(EGreedyParameters):
    """Parameters for the Bootstrapped exploration policy (multi-head Q network)."""

    def __init__(self):
        super().__init__()
        # number of Q heads available for selection
        self.architecture_num_q_heads = 10
        # probability that a transition is shared with each head
        self.bootstrapped_data_sharing_probability = 1.0
        self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.bootstrapped:Bootstrapped'
|
||||
|
||||
|
||||
class Bootstrapped(EGreedy):
    """
    e-greedy on top of a multi-head Q network: during training the action values
    of a single randomly selected head are used; during evaluation the heads vote
    and the majority action is passed on to the e-greedy policy.
    """

    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
                 architecture_num_q_heads: int,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy.
                                                         Defaults to AdditiveNoiseParameters().
        :param architecture_num_q_heads: the number of q heads to select from
        """
        # BUG FIX: the default used to be `AdditiveNoiseParameters()` evaluated once
        # at import time, so all instances shared (and mutated) one parameters
        # object - the classic mutable default argument pitfall
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        super().__init__(action_space, epsilon_schedule, evaluation_epsilon, continuous_exploration_policy_parameters)
        self.num_heads = architecture_num_q_heads
        # index of the head whose action values are used during training
        self.selected_head = 0
        self.last_action_values = 0

    def select_head(self):
        """Randomly draw the head index that will act from now on."""
        self.selected_head = np.random.randint(self.num_heads)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # action values are none in case the exploration policy is going to select a random action
        if action_values is not None:
            if self.phase == RunPhase.TRAIN:
                # training: act according to the currently selected head only
                action_values = action_values[self.selected_head]
            else:
                # evaluation: ensemble voting over the heads' greedy actions
                top_action_votings = np.argmax(action_values, axis=-1)
                counts = np.bincount(top_action_votings.squeeze())
                top_action = np.argmax(counts)
                # convert the top action to a one hot vector and pass it to e-greedy
                action_values = np.eye(len(self.action_space.actions))[[top_action]]
        self.last_action_values = action_values
        return super().get_action(action_values)

    def get_control_param(self):
        # index of the head currently in control
        return self.selected_head
|
||||
48
rl_coach/exploration_policies/categorical.py
Normal file
48
rl_coach/exploration_policies/categorical.py
Normal file
@@ -0,0 +1,48 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.spaces import ActionSpace
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
|
||||
|
||||
class CategoricalParameters(ExplorationParameters):
    """Parameters for the Categorical exploration policy."""

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.categorical:Categorical'
|
||||
|
||||
|
||||
class Categorical(ExplorationPolicy):
    """
    Treats the action values as a probability distribution: samples from it while
    training, acts greedily otherwise.
    """

    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        if self.phase != RunPhase.TRAIN:
            # take the action with the highest probability
            return np.argmax(action_values)
        # choose an action according to the given probabilities
        return np.random.choice(self.action_space.actions, p=action_values)

    def get_control_param(self):
        # this policy has no tunable exploration parameter
        return 0
|
||||
27
rl_coach/exploration_policies/continuous_entropy.py
Normal file
27
rl_coach/exploration_policies/continuous_entropy.py
Normal file
@@ -0,0 +1,27 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoise, AdditiveNoiseParameters
|
||||
|
||||
|
||||
class ContinuousEntropyParameters(AdditiveNoiseParameters):
    """Parameters for the ContinuousEntropy exploration policy."""

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.continuous_entropy:ContinuousEntropy'
|
||||
|
||||
|
||||
class ContinuousEntropy(AdditiveNoise):
    """Identical in behavior to AdditiveNoise; exists as a distinctly named policy."""
    pass
|
||||
102
rl_coach/exploration_policies/e_greedy.py
Normal file
102
rl_coach/exploration_policies/e_greedy.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.schedules import Schedule, LinearSchedule
|
||||
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
|
||||
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy
|
||||
|
||||
|
||||
class EGreedyParameters(ExplorationParameters):
    """Parameters for the EGreedy exploration policy."""

    def __init__(self):
        super().__init__()
        self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
        self.evaluation_epsilon = 0.05
        # for continuous control -
        # (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)
        self.continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        self.continuous_exploration_policy_parameters.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.e_greedy:EGreedy'
|
||||
|
||||
|
||||
class EGreedy(ExplorationPolicy):
    """
    e-greedy exploration: with probability epsilon an action is sampled from the
    action space, otherwise the policy acts greedily. For box (continuous) action
    spaces the greedy action is delegated to a wrapped continuous exploration
    policy.
    """

    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
                 evaluation_epsilon: float,
                 continuous_exploration_policy_parameters: ExplorationParameters=None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy.
                                                         Defaults to AdditiveNoiseParameters().
        """
        super().__init__(action_space)
        self.epsilon_schedule = epsilon_schedule
        self.evaluation_epsilon = evaluation_epsilon

        # BUG FIX: the default used to be `AdditiveNoiseParameters()` evaluated once
        # at import time, so every EGreedy instance shared (and mutated - see the
        # action_space assignment below) a single parameters object. Create a fresh
        # default per instance instead (mutable default argument pitfall).
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()

        if isinstance(self.action_space, BoxActionSpace):
            # for continuous e-greedy (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)
            continuous_exploration_policy_parameters.action_space = action_space
            self.continuous_exploration_policy = \
                dynamic_import_and_instantiate_module_from_params(continuous_exploration_policy_parameters)

        # random draw used for the next epsilon test; refreshed after each training action
        self.current_random_value = np.random.rand()

    def requires_action_values(self):
        """Action values are only needed when the upcoming action will be greedy."""
        epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
        return self.current_random_value >= epsilon

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value

        if isinstance(self.action_space, DiscreteActionSpace):
            top_action = np.argmax(action_values)
            if self.current_random_value < epsilon:
                chosen_action = self.action_space.sample()
            else:
                chosen_action = top_action
        else:
            # continuous spaces only explore randomly while training
            if self.current_random_value < epsilon and self.phase == RunPhase.TRAIN:
                chosen_action = self.action_space.sample()
            else:
                chosen_action = self.continuous_exploration_policy.get_action(action_values)

        # step the epsilon schedule and generate a new random value for next time
        if self.phase == RunPhase.TRAIN:
            self.epsilon_schedule.step()
        self.current_random_value = np.random.rand()
        return chosen_action

    def get_control_param(self):
        if isinstance(self.action_space, DiscreteActionSpace):
            return self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
        elif isinstance(self.action_space, BoxActionSpace):
            return self.continuous_exploration_policy.get_control_param()

    def change_phase(self, phase):
        """Propagate phase changes to the wrapped continuous exploration policy."""
        super().change_phase(phase)
        if isinstance(self.action_space, BoxActionSpace):
            self.continuous_exploration_policy.change_phase(phase)
|
||||
76
rl_coach/exploration_policies/exploration_policy.py
Normal file
76
rl_coach/exploration_policies/exploration_policy.py
Normal file
@@ -0,0 +1,76 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
from rl_coach.base_parameters import Parameters
|
||||
from rl_coach.spaces import ActionSpace
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
|
||||
|
||||
class ExplorationParameters(Parameters):
    """
    Base parameters class for exploration policies. Subclasses define the policy's
    constructor parameters and point `path` at the policy class to instantiate.
    """

    def __init__(self):
        # BUG FIX: chain to the Parameters base initializer - every subclass in this
        # package calls super().__init__(), but the chain previously stopped here
        super().__init__()
        # filled in externally before the policy is instantiated
        self.action_space = None

    @property
    def path(self):
        """Module path and class name of the exploration policy to instantiate."""
        return 'rl_coach.exploration_policies.exploration_policy:ExplorationPolicy'
|
||||
|
||||
|
||||
class ExplorationPolicy(object):
    """
    Base class for exploration policies. An exploration policy chooses an action
    given the action values, the current run phase and its own internal state.
    """

    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        self.phase = RunPhase.HEATUP
        self.action_space = action_space

    def reset(self):
        """
        Used for resetting the exploration policy parameters when needed
        :return: None
        """
        pass

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        """
        Given a list of values corresponding to each action, choose one action
        according to the exploration policy
        :param action_values: A list of action values
        :return: The chosen action
        """
        pass

    def change_phase(self, phase):
        """
        Change between running phases of the algorithm
        :param phase: Either Heatup or Train
        :return: none
        """
        self.phase = phase

    def requires_action_values(self) -> bool:
        """
        Allows exploration policies to define if they require the action values for the current step.
        This can save up a lot of computation. For example in e-greedy, if the random value generated is smaller
        than epsilon, the action is completely random, and the action values don't need to be calculated
        :return: True if the action values are required. False otherwise
        """
        return True

    def get_control_param(self):
        # value of the policy's main exploration control parameter (none by default)
        return 0
|
||||
46
rl_coach/exploration_policies/greedy.py
Normal file
46
rl_coach/exploration_policies/greedy.py
Normal file
@@ -0,0 +1,46 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
|
||||
|
||||
from rl_coach.core_types import ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
|
||||
|
||||
class GreedyParameters(ExplorationParameters):
    """Parameters for the Greedy exploration policy."""

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.greedy:Greedy'
|
||||
|
||||
|
||||
class Greedy(ExplorationPolicy):
    """
    Deterministic policy: argmax of the action values for discrete spaces, the
    action values themselves for box (continuous) spaces.
    """

    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # isinstance rather than exact type comparison: consistent with the checks
        # used by the other exploration policies, and also accepts subclasses of
        # the supported action spaces
        if isinstance(self.action_space, DiscreteActionSpace):
            return np.argmax(action_values)
        if isinstance(self.action_space, BoxActionSpace):
            return action_values
        # NOTE(review): any other action space type implicitly returns None,
        # matching the original fall-through behavior

    def get_control_param(self):
        # this policy has no tunable exploration parameter
        return 0
|
||||
81
rl_coach/exploration_policies/ou_process.py
Normal file
81
rl_coach/exploration_policies/ou_process.py
Normal file
@@ -0,0 +1,81 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.spaces import ActionSpace, BoxActionSpace, GoalsSpace
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
|
||||
|
||||
# Based on the description in:
|
||||
# https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
|
||||
class OUProcessParameters(ExplorationParameters):
    """Parameters for the Ornstein-Uhlenbeck process exploration policy."""

    def __init__(self):
        super().__init__()
        # long-term mean the process reverts to
        self.mu = 0
        # mean reversion rate
        self.theta = 0.15
        # noise scale
        self.sigma = 0.2
        # discretization time step
        self.dt = 0.01

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.ou_process:OUProcess'
|
||||
|
||||
|
||||
# Ornstein-Uhlenbeck process
|
||||
# Ornstein-Uhlenbeck process
class OUProcess(ExplorationPolicy):
    """
    Adds temporally correlated noise generated by an Ornstein-Uhlenbeck process to
    the action means while training; in other phases no noise is added.
    """

    def __init__(self, action_space: ActionSpace, mu: float=0, theta: float=0.15, sigma: float=0.2, dt: float=0.01):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)
        self.mu = float(mu) * np.ones(self.action_space.shape)
        self.theta = float(theta)
        self.sigma = float(sigma) * np.ones(self.action_space.shape)
        self.state = np.zeros(self.action_space.shape)
        self.dt = dt

        if not (isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)):
            raise ValueError("OU process exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))

    def reset(self):
        # restart the process from the origin
        self.state = np.zeros(self.action_space.shape)

    def noise(self):
        # one discretized step of the OU process: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*dW
        previous = self.state
        drift = self.theta * (self.mu - previous) * self.dt
        diffusion = self.sigma * np.random.randn(len(previous)) * np.sqrt(self.dt)
        self.state = previous + drift + diffusion
        return self.state

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # perturb the action means with process noise only while training
        if self.phase == RunPhase.TRAIN:
            perturbation = self.noise()
        else:
            perturbation = np.zeros(self.action_space.shape)
        return action_values.squeeze() + perturbation

    def get_control_param(self):
        # current noise state while training, zeros otherwise
        if self.phase == RunPhase.TRAIN:
            return self.state
        return np.zeros(self.action_space.shape)
|
||||
100
rl_coach/exploration_policies/truncated_normal.py
Normal file
100
rl_coach/exploration_policies/truncated_normal.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.schedules import Schedule, LinearSchedule
|
||||
from scipy.stats import truncnorm
|
||||
from rl_coach.spaces import ActionSpace, BoxActionSpace
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
|
||||
|
||||
class TruncatedNormalParameters(ExplorationParameters):
    """Parameters for the TruncatedNormal exploration policy."""

    def __init__(self):
        super().__init__()
        # schedule for the noise variance percentage, relative to the action space range
        self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        # fixed noise variance percentage applied during evaluation phases
        self.evaluation_noise_percentage = 0.05
        # truncation bounds for the sampled actions
        self.clip_low = 0
        self.clip_high = 1

    @property
    def path(self):
        """Module path and class name used to instantiate the policy."""
        return 'rl_coach.exploration_policies.truncated_normal:TruncatedNormal'
|
||||
|
||||
|
||||
class TruncatedNormal(ExplorationPolicy):
    """Continuous exploration policy sampling actions from a truncated normal.

    The distribution is centered on the predicted action mean. Its standard
    deviation is a percentage of the action-space range -- taken from a
    schedule during training, or a fixed value during evaluation -- unless
    the agent also supplies a predicted stdev, which then takes precedence.
    Samples are truncated to the [clip_low, clip_high] interval.
    """
    def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
                 evaluation_noise_percentage: float, clip_low: float, clip_high: float):
        """
        :param action_space: the action space used by the environment
        :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range
                                          of the action space
        :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases
        :param clip_low: lower truncation bound for sampled actions
        :param clip_high: upper truncation bound for sampled actions
        :raises ValueError: if the action space is not a bounded BoxActionSpace
        """
        super().__init__(action_space)
        self.noise_percentage_schedule = noise_percentage_schedule
        self.evaluation_noise_percentage = evaluation_noise_percentage
        self.clip_low = clip_low
        self.clip_high = clip_high

        if not isinstance(action_space, BoxActionSpace):
            raise ValueError("Truncated normal exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))

        # the noise stdev is a percentage of the action range, so the range must be finite
        # (fixed: the error message previously said "Additive noise exploration" -- a
        # copy-paste from the additive-noise policy)
        if not (np.all(np.isfinite(action_space.low)) and np.all(np.isfinite(action_space.high))):
            raise ValueError("Truncated normal exploration requires bounded actions")

        # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        """Sample an action from a truncated normal around the predicted mean.

        :param action_values: either a numpy array holding the action mean, or a
                              list whose first element is the mean and whose
                              optional second element is the stdev
        :return: the sampled action (a size-1 sample from the distribution)
        """
        # set the current noise percentage according to the phase
        if self.phase == RunPhase.TEST:
            current_noise_percentage = self.evaluation_noise_percentage
        else:
            current_noise_percentage = self.noise_percentage_schedule.current_value

        # scale the noise to the action space range
        action_values_std = current_noise_percentage * (self.action_space.high - self.action_space.low)

        # extract the mean values
        if isinstance(action_values, list):
            # the action values are expected to be a list with the action mean and optionally the action stdev
            action_values_mean = action_values[0].squeeze()
        else:
            # the action values are expected to be a numpy array representing the action mean
            action_values_mean = action_values.squeeze()

        # step the noise schedule only while training
        if self.phase == RunPhase.TRAIN:
            self.noise_percentage_schedule.step()

        # the second element of the list is assumed to be the standard deviation
        # and, when present, overrides the scheduled stdev
        if isinstance(action_values, list) and len(action_values) > 1:
            action_values_std = action_values[1].squeeze()

        # sample from a normal distribution truncated to [clip_low, clip_high];
        # truncnorm expects its bounds normalized to stdev units around the mean
        normalized_low = (self.clip_low - action_values_mean) / action_values_std
        normalized_high = (self.clip_high - action_values_mean) / action_values_std
        distribution = truncnorm(normalized_low, normalized_high,
                                 loc=action_values_mean, scale=action_values_std)
        action = distribution.rvs(1)

        return action

    def get_control_param(self):
        """Return the current scheduled noise percentage, broadcast to the action shape."""
        return np.ones(self.action_space.shape) * self.noise_percentage_schedule.current_value
|
||||
83
rl_coach/exploration_policies/ucb.py
Normal file
83
rl_coach/exploration_policies/ucb.py
Normal file
@@ -0,0 +1,83 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedy, EGreedyParameters
|
||||
from rl_coach.schedules import Schedule, LinearSchedule, PieceWiseSchedule
|
||||
from rl_coach.spaces import ActionSpace
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType, EnvironmentSteps
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
|
||||
|
||||
|
||||
class UCBParameters(EGreedyParameters):
    """Parameters for the UCB exploration policy (upper confidence bound
    computed over an ensemble of bootstrapped Q heads)."""
    def __init__(self):
        super().__init__()
        # size of the Q-head ensemble used to estimate value uncertainty
        self.architecture_num_q_heads = 10
        # probability of sharing each transition with a bootstrapped head
        self.bootstrapped_data_sharing_probability = 1.0
        # epsilon decays 1 -> 0.1 over the first 1M env steps,
        # then 0.1 -> 0.01 over the following 4M env steps
        epsilon_segments = [
            (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
            (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000)),
        ]
        self.epsilon_schedule = PieceWiseSchedule(epsilon_segments)
        # weight of the stdev bonus added to the mean Q values
        self.lamb = 0.1

    @property
    def path(self):
        # 'module:class' location the framework uses to instantiate the policy
        return 'rl_coach.exploration_policies.ucb:UCB'
|
||||
|
||||
|
||||
class UCB(EGreedy):
    """Upper-confidence-bound exploration over an ensemble of Q heads.

    Per-head action values are aggregated into one set of values
    (mean + lamb * stdev while training, plain mean otherwise) and then
    passed through the inherited e-greedy action selection.
    """
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
                 architecture_num_q_heads: int, lamb: float,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param architecture_num_q_heads: the number of q heads to select from
        :param lamb: lambda coefficient for taking the standard deviation into account
                     (annotation fixed: a float, matching the 0.1 default in UCBParameters)
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy; defaults to
                                                         a fresh AdditiveNoiseParameters instance (fixed: previously a
                                                         single mutable instance shared across all calls)
        """
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        super().__init__(action_space, epsilon_schedule, evaluation_epsilon, continuous_exploration_policy_parameters)
        self.num_heads = architecture_num_q_heads
        self.lamb = lamb
        # stdev across heads, updated on each TRAIN-phase call to get_action
        self.std = 0
        # aggregated values from the most recent call with non-None action_values
        self.last_action_values = 0

    def select_head(self):
        # no explicit head selection is needed -- all heads are aggregated in get_action
        pass

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        """Aggregate per-head action values and delegate to e-greedy selection.

        :param action_values: per-head action values stacked along axis 0
        :return: the action chosen by the parent e-greedy policy
        """
        # action values are none in case the exploration policy is going to select a random action
        if action_values is not None:
            if self.requires_action_values():
                mean = np.mean(action_values, axis=0)
                if self.phase == RunPhase.TRAIN:
                    # optimism in the face of uncertainty: boost actions the heads disagree on
                    self.std = np.std(action_values, axis=0)
                    self.last_action_values = mean + self.lamb * self.std
                else:
                    self.last_action_values = mean
        return super().get_action(self.last_action_values)

    def get_control_param(self):
        """Return the mean ensemble stdev while training, 0 in other phases."""
        if self.phase == RunPhase.TRAIN:
            return np.mean(self.std)
        else:
            return 0
|
||||
Reference in New Issue
Block a user