1
0
mirror of https://github.com/gryf/coach.git synced 2026-03-27 22:03:33 +01:00

pre-release 0.10.0

This commit is contained in:
Gal Novik
2018-08-13 17:11:34 +03:00
parent d44c329bb8
commit 19ca5c24b1
485 changed files with 33292 additions and 16770 deletions

View File

@@ -0,0 +1,42 @@
# Exploration Policy
An exploration policy is a module that is responsible for choosing the action according to the action values, the
current phase, its internal state and the specific exploration policy algorithm.
A custom exploration policy should implement both the exploration policy class and the exploration policy parameters
class, which defines the parameters and the location of the exploration policy module.
The parameters of the exploration policy class should match the parameters in the exploration policy parameters class.
Exploration policies typically have some control parameter that defines their current exploration state, and
a schedule for this parameter. This schedule can be defined using the Schedule class which is defined in
exploration_policy.py.
A custom implementation should look as follows:
```
class CustomExplorationParameters(ExplorationParameters):
def __init__(self):
super().__init__()
...
@property
def path(self):
return 'module_path:class_name'
class CustomExplorationPolicy(ExplorationPolicy):
def __init__(self, action_space: ActionSpace, ...):
super().__init__(action_space)
def reset(self):
...
def get_action(self, action_values: List[ActionType]) -> ActionType:
...
def change_phase(self, phase):
...
def get_control_param(self):
...
```

View File

@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@@ -0,0 +1,95 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace, BoxActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
# TODO: consider renaming to gaussian sampling
class AdditiveNoiseParameters(ExplorationParameters):
    """Parameters for the AdditiveNoise exploration policy."""
    def __init__(self):
        super().__init__()
        # schedule for the noise stdev as a percentage of the action range
        self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        # fixed noise percentage used while evaluating
        self.evaluation_noise_percentage = 0.05

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.additive_noise:AdditiveNoise'
class AdditiveNoise(ExplorationPolicy):
    """Exploration policy that perturbs continuous actions with Gaussian noise.

    The noise standard deviation is a percentage of the action range, taken
    from a schedule during training and from a fixed value during evaluation.
    If the caller supplies its own stdev as the second element of the action
    values list, that stdev is used instead of the scheduled one.
    """

    def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
                 evaluation_noise_percentage: float):
        """
        :param action_space: the action space used by the environment
        :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the
                                          absolute range of the action space
        :param evaluation_noise_percentage: the noise variance percentage used during evaluation phases
        """
        super().__init__(action_space)
        self.noise_percentage_schedule = noise_percentage_schedule
        self.evaluation_noise_percentage = evaluation_noise_percentage

        if not isinstance(action_space, BoxActionSpace):
            raise ValueError("Additive noise exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))
        # noise is defined as a fraction of the action range, so the range must be finite
        if not np.all(np.isfinite(action_space.high)) or not np.all(np.isfinite(action_space.low)):
            raise ValueError("Additive noise exploration requires bounded actions")
        # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # TODO-potential-bug consider separating internally defined stdev and externally defined stdev into 2 policies
        # pick the noise percentage for the current phase
        if self.phase == RunPhase.TEST:
            noise_percentage = self.evaluation_noise_percentage
        else:
            noise_percentage = self.noise_percentage_schedule.current_value

        # scale the noise to the action space range
        noise_std = noise_percentage * (self.action_space.high - self.action_space.low)

        # a list holds the action mean (and optionally a stdev); a bare array holds the mean only
        if isinstance(action_values, list):
            action_mean = action_values[0].squeeze()
        else:
            action_mean = action_values.squeeze()

        # advance the noise schedule only while training
        if self.phase == RunPhase.TRAIN:
            self.noise_percentage_schedule.step()

        # a caller-provided stdev (second list element) overrides the scheduled one
        if isinstance(action_values, list) and len(action_values) > 1:
            noise_std = action_values[1].squeeze()

        # sample the noisy action around the mean
        return np.random.normal(action_mean, noise_std)

    def get_control_param(self):
        return np.ones(self.action_space.shape) * self.noise_percentage_schedule.current_value

View File

@@ -0,0 +1,59 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.schedules import Schedule
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
class BoltzmannParameters(ExplorationParameters):
    """Parameters for the Boltzmann exploration policy."""
    def __init__(self):
        super().__init__()
        # no default schedule - this must be assigned before the policy is instantiated
        self.temperature_schedule = None

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.boltzmann:Boltzmann'
class Boltzmann(ExplorationPolicy):
    """Exploration policy that samples discrete actions from a softmax over the action values.

    The softmax temperature comes from a schedule: high temperatures make the
    distribution close to uniform, low temperatures make it close to greedy.
    """
    def __init__(self, action_space: ActionSpace, temperature_schedule: Schedule):
        """
        :param action_space: the action space used by the environment
        :param temperature_schedule: the schedule for the temperature parameter of the softmax
        """
        super().__init__(action_space)
        self.temperature_schedule = temperature_schedule

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        if self.phase == RunPhase.TRAIN:
            self.temperature_schedule.step()
        # numerically stable softmax: subtracting the max before exponentiating
        # avoids overflow for large action values or small temperatures, and
        # leaves the resulting probabilities unchanged
        scaled_values = action_values / self.temperature_schedule.current_value
        exp_probabilities = np.exp(scaled_values - np.max(scaled_values))
        probabilities = exp_probabilities / np.sum(exp_probabilities)
        # make sure probs sum to 1 despite floating point rounding
        probabilities[-1] = 1 - np.sum(probabilities[:-1])
        # choose actions according to the probabilities
        return np.random.choice(range(self.action_space.shape), p=probabilities)

    def get_control_param(self):
        return self.temperature_schedule.current_value

View File

@@ -0,0 +1,77 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.e_greedy import EGreedy, EGreedyParameters
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
class BootstrappedParameters(EGreedyParameters):
    """Parameters for the Bootstrapped exploration policy."""
    def __init__(self):
        super().__init__()
        # number of Q heads in the bootstrapped network
        self.architecture_num_q_heads = 10
        # presumably the probability that a transition is shared with each head
        # (consumed elsewhere, e.g. by the memory) - TODO confirm against caller
        self.bootstrapped_data_sharing_probability = 1.0
        self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.bootstrapped:Bootstrapped'
class Bootstrapped(EGreedy):
    """E-greedy exploration over a bootstrapped (multi-head) Q network.

    During training, actions follow a single randomly selected head; during
    evaluation, all heads vote and the majority action is fed to e-greedy.
    """
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
                 architecture_num_q_heads: int,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param architecture_num_q_heads: the number of q heads to select from
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy
                                                         (defaults to AdditiveNoiseParameters)
        """
        # avoid a mutable default argument: a default evaluated at definition time
        # would be a single AdditiveNoiseParameters object silently shared between
        # every Bootstrapped instance created without an explicit parameters object
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        super().__init__(action_space, epsilon_schedule, evaluation_epsilon, continuous_exploration_policy_parameters)
        self.num_heads = architecture_num_q_heads
        # index of the head currently driving action selection during training
        self.selected_head = 0
        self.last_action_values = 0

    def select_head(self):
        """Randomly pick the Q head to follow for subsequent training actions."""
        self.selected_head = np.random.randint(self.num_heads)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # action values are None in case the exploration policy is going to select a random action
        if action_values is not None:
            if self.phase == RunPhase.TRAIN:
                action_values = action_values[self.selected_head]
            else:
                # ensemble voting for evaluation: each head votes for its argmax action
                top_action_votings = np.argmax(action_values, axis=-1)
                counts = np.bincount(top_action_votings.squeeze())
                top_action = np.argmax(counts)
                # convert the top action to a one hot vector and pass it to e-greedy
                action_values = np.eye(len(self.action_space.actions))[[top_action]]
            self.last_action_values = action_values
        return super().get_action(action_values)

    def get_control_param(self):
        return self.selected_head

View File

@@ -0,0 +1,48 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
class CategoricalParameters(ExplorationParameters):
    """Parameters for the Categorical exploration policy."""
    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.categorical:Categorical'
class Categorical(ExplorationPolicy):
    """Samples discrete actions from the probability distribution given as the action values."""
    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        if self.phase == RunPhase.TRAIN:
            # sample an action according to the given probabilities
            return np.random.choice(self.action_space.actions, p=action_values)
        # outside of training, act greedily on the distribution
        return np.argmax(action_values)

    def get_control_param(self):
        # this policy has no tunable exploration parameter
        return 0

View File

@@ -0,0 +1,27 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rl_coach.exploration_policies.additive_noise import AdditiveNoise, AdditiveNoiseParameters
class ContinuousEntropyParameters(AdditiveNoiseParameters):
    # Identical to AdditiveNoiseParameters except for the module path it points at.
    @property
    def path(self):
        """Module path used to dynamically instantiate the policy class."""
        return 'rl_coach.exploration_policies.continuous_entropy:ContinuousEntropy'
class ContinuousEntropy(AdditiveNoise):
    """Behaviorally identical to AdditiveNoise, kept as a distinct class name.

    NOTE(review): presumably exists so agents that learn their own action stdev
    can be configured separately from plain additive noise - confirm with callers.
    """
    pass

View File

@@ -0,0 +1,102 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy
class EGreedyParameters(ExplorationParameters):
    """Parameters for the EGreedy exploration policy."""
    def __init__(self):
        super().__init__()
        # schedule for epsilon - the probability of acting randomly
        self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
        # fixed epsilon used during evaluation phases
        self.evaluation_epsilon = 0.05
        # for continuous control, the non-random branch delegates to a noise policy
        # (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)
        self.continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        self.continuous_exploration_policy_parameters.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.e_greedy:EGreedy'
class EGreedy(ExplorationPolicy):
    """Epsilon-greedy exploration policy.

    With probability epsilon a random action is sampled from the action space.
    Otherwise the greedy action is taken (discrete spaces), or a continuous
    exploration policy - additive noise by default - is applied (box spaces).
    """
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
                 evaluation_epsilon: float,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy
                                                         (defaults to AdditiveNoiseParameters)
        """
        super().__init__(action_space)
        self.epsilon_schedule = epsilon_schedule
        self.evaluation_epsilon = evaluation_epsilon

        # avoid a mutable default argument: a default evaluated at function definition
        # time would be one AdditiveNoiseParameters object shared by every EGreedy
        # instance created without an explicit parameters object
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()

        if isinstance(self.action_space, BoxActionSpace):
            # for continuous e-greedy (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)
            continuous_exploration_policy_parameters.action_space = action_space
            self.continuous_exploration_policy = \
                dynamic_import_and_instantiate_module_from_params(continuous_exploration_policy_parameters)

        # drawn once per action so that requires_action_values and get_action agree
        self.current_random_value = np.random.rand()

    def requires_action_values(self):
        """Action values are only needed when the upcoming action will not be random."""
        epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
        return self.current_random_value >= epsilon

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value

        if isinstance(self.action_space, DiscreteActionSpace):
            top_action = np.argmax(action_values)
            if self.current_random_value < epsilon:
                chosen_action = self.action_space.sample()
            else:
                chosen_action = top_action
        else:
            # continuous spaces only take fully random actions while training
            if self.current_random_value < epsilon and self.phase == RunPhase.TRAIN:
                chosen_action = self.action_space.sample()
            else:
                chosen_action = self.continuous_exploration_policy.get_action(action_values)

        # step the epsilon schedule and generate a new random value for next time
        if self.phase == RunPhase.TRAIN:
            self.epsilon_schedule.step()
        self.current_random_value = np.random.rand()

        return chosen_action

    def get_control_param(self):
        if isinstance(self.action_space, DiscreteActionSpace):
            return self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
        elif isinstance(self.action_space, BoxActionSpace):
            return self.continuous_exploration_policy.get_control_param()

    def change_phase(self, phase):
        """Propagate phase changes to the wrapped continuous exploration policy."""
        super().change_phase(phase)
        if isinstance(self.action_space, BoxActionSpace):
            self.continuous_exploration_policy.change_phase(phase)

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
from rl_coach.base_parameters import Parameters
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType
class ExplorationParameters(Parameters):
    """Base class for the parameters of an exploration policy.

    Subclasses add their own parameters and override `path` to point at the
    module and class implementing the policy.
    """
    def __init__(self):
        # initialize the Parameters base class before adding our own attributes;
        # the original skipped this call, unlike the policy parameter subclasses
        # which all invoke super().__init__()
        super().__init__()
        # filled in externally (e.g. by EGreedy.__init__) before instantiation
        self.action_space = None

    @property
    def path(self):
        """Module path used to dynamically instantiate the policy class."""
        return 'rl_coach.exploration_policies.exploration_policy:ExplorationPolicy'
class ExplorationPolicy(object):
    """Base class for exploration policies.

    An exploration policy chooses an action given the action values produced by
    the agent, according to the current run phase and its internal state.
    """
    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        # policies start in the heatup phase; change_phase updates this later
        self.phase = RunPhase.HEATUP
        self.action_space = action_space

    def reset(self) -> None:
        """
        Used for resetting the exploration policy parameters when needed
        :return: None
        """
        pass

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        """
        Given a list of values corresponding to each action,
        choose one action according to the exploration policy
        :param action_values: A list of action values
        :return: The chosen action
        """
        pass

    def change_phase(self, phase) -> None:
        """
        Change between running phases of the algorithm
        :param phase: Either Heatup or Train
        :return: none
        """
        self.phase = phase

    def requires_action_values(self) -> bool:
        """
        Allows exploration policies to define if they require the action values for the current step.
        This can save up a lot of computation. For example in e-greedy, if the random value generated is smaller
        than epsilon, the action is completely random, and the action values don't need to be calculated
        :return: True if the action values are required. False otherwise
        """
        return True

    def get_control_param(self):
        """Return the current value of the policy's exploration control parameter."""
        return 0

View File

@@ -0,0 +1,46 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
from rl_coach.core_types import ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
class GreedyParameters(ExplorationParameters):
    """Parameters for the Greedy exploration policy."""
    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.greedy:Greedy'
class Greedy(ExplorationPolicy):
    """Always selects the greedy action, with no exploration."""
    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # isinstance instead of exact type comparison so that subclasses of the
        # action space types are handled as well
        if isinstance(self.action_space, DiscreteActionSpace):
            # the greedy discrete action is the one with the highest value
            return np.argmax(action_values)
        if isinstance(self.action_space, BoxActionSpace):
            # continuous actions are passed through unchanged
            return action_values
        # any other action space type falls through and returns None, as before

    def get_control_param(self):
        # this policy has no exploration parameter
        return 0

View File

@@ -0,0 +1,81 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.spaces import ActionSpace, BoxActionSpace, GoalsSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
# Based on on the description in:
# https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OUProcessParameters(ExplorationParameters):
    """Parameters for the OUProcess exploration policy."""
    def __init__(self):
        super().__init__()
        self.mu = 0        # long-term mean the process reverts to
        self.theta = 0.15  # mean-reversion rate
        self.sigma = 0.2   # diffusion (noise) coefficient
        self.dt = 0.01     # integration time step

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.ou_process:OUProcess'
# Ornstein-Uhlenbeck process
class OUProcess(ExplorationPolicy):
    """Ornstein-Uhlenbeck process exploration for continuous action spaces.

    Adds temporally correlated noise to the action during training; outside of
    training the action is returned unperturbed.
    Based on: https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
    """
    def __init__(self, action_space: ActionSpace, mu: float=0, theta: float=0.15, sigma: float=0.2, dt: float=0.01):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)
        self.mu = float(mu) * np.ones(self.action_space.shape)
        self.theta = float(theta)
        self.sigma = float(sigma) * np.ones(self.action_space.shape)
        self.state = np.zeros(self.action_space.shape)
        self.dt = dt
        if not (isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)):
            raise ValueError("OU process exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))

    def reset(self):
        """Reset the process state back to zero."""
        self.state = np.zeros(self.action_space.shape)

    def noise(self):
        """Advance the OU process one step and return the new state."""
        previous = self.state
        # Euler-Maruyama step: mean-reverting drift plus scaled Gaussian diffusion
        step = self.theta * (self.mu - previous) * self.dt + self.sigma * np.random.randn(len(previous)) * np.sqrt(self.dt)
        self.state = previous + step
        return self.state

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # only training adds exploration noise; other phases act deterministically
        if self.phase == RunPhase.TRAIN:
            perturbation = self.noise()
        else:
            perturbation = np.zeros(self.action_space.shape)
        return action_values.squeeze() + perturbation

    def get_control_param(self):
        # the current process state doubles as the exploration magnitude report
        if self.phase == RunPhase.TRAIN:
            return self.state
        return np.zeros(self.action_space.shape)

View File

@@ -0,0 +1,100 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.schedules import Schedule, LinearSchedule
from scipy.stats import truncnorm
from rl_coach.spaces import ActionSpace, BoxActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
class TruncatedNormalParameters(ExplorationParameters):
    """Parameters for the TruncatedNormal exploration policy."""
    def __init__(self):
        super().__init__()
        # schedule for the noise stdev as a percentage of the action range
        self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        # fixed noise percentage used while evaluating
        self.evaluation_noise_percentage = 0.05
        # truncation bounds for the sampled actions
        self.clip_low = 0
        self.clip_high = 1

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.truncated_normal:TruncatedNormal'
class TruncatedNormal(ExplorationPolicy):
    """Samples continuous actions from a truncated normal distribution.

    The distribution is centered on the predicted action mean, with a standard
    deviation given either by a noise schedule (as a percentage of the action
    range) or by the agent itself, truncated to [clip_low, clip_high].
    """
    def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
                 evaluation_noise_percentage: float, clip_low: float, clip_high: float):
        """
        :param action_space: the action space used by the environment
        :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the
                                          absolute range of the action space
        :param evaluation_noise_percentage: the noise variance percentage used during evaluation phases
        :param clip_low: lower bound the sampled actions are truncated to
        :param clip_high: upper bound the sampled actions are truncated to
        """
        super().__init__(action_space)
        self.noise_percentage_schedule = noise_percentage_schedule
        self.evaluation_noise_percentage = evaluation_noise_percentage
        self.clip_low = clip_low
        self.clip_high = clip_high

        if not isinstance(action_space, BoxActionSpace):
            raise ValueError("Truncated normal exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))
        if not np.all(np.isfinite(action_space.high)) or not np.all(np.isfinite(action_space.low)):
            # message fixed: it previously named additive noise (copy-paste from additive_noise.py)
            raise ValueError("Truncated normal exploration requires bounded actions")
        # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # set the current noise percentage according to the phase
        if self.phase == RunPhase.TEST:
            current_noise_percentage = self.evaluation_noise_percentage
        else:
            current_noise_percentage = self.noise_percentage_schedule.current_value

        # scale the noise to the action space range
        action_values_std = current_noise_percentage * (self.action_space.high - self.action_space.low)

        # extract the mean values
        if isinstance(action_values, list):
            # the action values are expected to be a list with the action mean and optionally the action stdev
            action_values_mean = action_values[0].squeeze()
        else:
            # the action values are expected to be a numpy array representing the action mean
            action_values_mean = action_values.squeeze()

        # step the noise schedule only while training
        if self.phase == RunPhase.TRAIN:
            self.noise_percentage_schedule.step()

        # the second element of the list is assumed to be the standard deviation
        if isinstance(action_values, list) and len(action_values) > 1:
            action_values_std = action_values[1].squeeze()

        # sample from the truncated normal distribution; truncnorm expects the
        # truncation bounds normalized to the standard normal, so convert first
        normalized_low = (self.clip_low - action_values_mean) / action_values_std
        normalized_high = (self.clip_high - action_values_mean) / action_values_std
        distribution = truncnorm(normalized_low, normalized_high, loc=action_values_mean, scale=action_values_std)
        return distribution.rvs(1)

    def get_control_param(self):
        return np.ones(self.action_space.shape) * self.noise_percentage_schedule.current_value

View File

@@ -0,0 +1,83 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.e_greedy import EGreedy, EGreedyParameters
from rl_coach.schedules import Schedule, LinearSchedule, PieceWiseSchedule
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType, EnvironmentSteps
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
class UCBParameters(EGreedyParameters):
    """Parameters for the UCB exploration policy."""
    def __init__(self):
        super().__init__()
        # number of Q heads in the network
        self.architecture_num_q_heads = 10
        # presumably the probability that a transition is shared with each head
        # (consumed elsewhere, e.g. by the memory) - TODO confirm against caller
        self.bootstrapped_data_sharing_probability = 1.0
        # epsilon decays in two linear segments over the first 5M environment steps
        self.epsilon_schedule = PieceWiseSchedule([
            (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
            (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000))
        ])
        # weight of the per-action stdev added to the mean action values
        self.lamb = 0.1

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.ucb:UCB'
class UCB(EGreedy):
    """Upper-confidence-bound exploration over a multi-head Q network.

    The action values handed to the underlying e-greedy policy are the mean over
    the heads, plus (during training) lambda times their standard deviation.
    """
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
                 architecture_num_q_heads: int, lamb: float,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param architecture_num_q_heads: the number of q heads to select from
        :param lamb: lambda coefficient for taking the standard deviation into account
                     (annotation corrected to float - the default in UCBParameters is 0.1)
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy
                                                         (defaults to AdditiveNoiseParameters)
        """
        # avoid a mutable default argument: a default evaluated at definition time
        # would be one AdditiveNoiseParameters object shared between all UCB instances
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        super().__init__(action_space, epsilon_schedule, evaluation_epsilon, continuous_exploration_policy_parameters)
        self.num_heads = architecture_num_q_heads
        self.lamb = lamb
        # standard deviation across heads from the most recent training step
        self.std = 0
        self.last_action_values = 0

    def select_head(self):
        """UCB aggregates over all heads, so there is no single head to select."""
        pass

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # action values are None in case the exploration policy is going to select a random action
        if action_values is not None:
            if self.requires_action_values():
                mean = np.mean(action_values, axis=0)
                if self.phase == RunPhase.TRAIN:
                    self.std = np.std(action_values, axis=0)
                    # optimism in the face of uncertainty: boost actions the heads disagree on
                    self.last_action_values = mean + self.lamb * self.std
                else:
                    self.last_action_values = mean
        return super().get_action(self.last_action_values)

    def get_control_param(self):
        if self.phase == RunPhase.TRAIN:
            return np.mean(self.std)
        return 0