1
0
mirror of https://github.com/gryf/coach.git synced 2026-03-11 12:05:47 +01:00

Create a dataset using an agent (#306)

Generate a dataset using an agent (allowing to select between this and a random dataset)
This commit is contained in:
Gal Leibovich
2019-05-28 09:34:49 +03:00
committed by GitHub
parent 342b7184bc
commit 9e9c4fd332
26 changed files with 351 additions and 111 deletions

View File

@@ -17,14 +17,17 @@
from typing import List
import numpy as np
import scipy.stats
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace, BoxActionSpace
# TODO: consider renaming to gaussian sampling
class AdditiveNoiseParameters(ExplorationParameters):
def __init__(self):
super().__init__()
@@ -36,7 +39,7 @@ class AdditiveNoiseParameters(ExplorationParameters):
return 'rl_coach.exploration_policies.additive_noise:AdditiveNoise'
class AdditiveNoise(ExplorationPolicy):
class AdditiveNoise(ContinuousActionExplorationPolicy):
"""
AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent
and adds a Gaussian distributed noise to it. The amount of noise added to the action follows the noise amount that

View File

@@ -19,7 +19,7 @@ from typing import List
import numpy as np
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import DiscreteActionExplorationPolicy, ExplorationParameters
from rl_coach.schedules import Schedule
from rl_coach.spaces import ActionSpace
@@ -34,8 +34,7 @@ class BoltzmannParameters(ExplorationParameters):
return 'rl_coach.exploration_policies.boltzmann:Boltzmann'
class Boltzmann(ExplorationPolicy):
class Boltzmann(DiscreteActionExplorationPolicy):
"""
The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
@@ -50,7 +49,7 @@ class Boltzmann(ExplorationPolicy):
super().__init__(action_space)
self.temperature_schedule = temperature_schedule
def get_action(self, action_values: List[ActionType]) -> ActionType:
def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
if self.phase == RunPhase.TRAIN:
self.temperature_schedule.step()
# softmax calculation
@@ -59,7 +58,8 @@ class Boltzmann(ExplorationPolicy):
# make sure probs sum to 1
probabilities[-1] = 1 - np.sum(probabilities[:-1])
# choose actions according to the probabilities
return np.random.choice(range(self.action_space.shape), p=probabilities)
action = np.random.choice(range(self.action_space.shape), p=probabilities)
return action, probabilities
def get_control_param(self):
return self.temperature_schedule.current_value

View File

@@ -19,7 +19,7 @@ from typing import List
import numpy as np
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import DiscreteActionExplorationPolicy, ExplorationParameters
from rl_coach.spaces import ActionSpace
@@ -29,7 +29,7 @@ class CategoricalParameters(ExplorationParameters):
return 'rl_coach.exploration_policies.categorical:Categorical'
class Categorical(ExplorationPolicy):
class Categorical(DiscreteActionExplorationPolicy):
"""
Categorical exploration policy is intended for discrete action spaces. It expects the action values to
represent a probability distribution over the action, from which a single action will be sampled.
@@ -42,13 +42,18 @@ class Categorical(ExplorationPolicy):
"""
super().__init__(action_space)
def get_action(self, action_values: List[ActionType]) -> ActionType:
def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
if self.phase == RunPhase.TRAIN:
# choose actions according to the probabilities
return np.random.choice(self.action_space.actions, p=action_values)
action = np.random.choice(self.action_space.actions, p=action_values)
return action, action_values
else:
# take the action with the highest probability
return np.argmax(action_values)
action = np.argmax(action_values)
one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
one_hot_action_probabilities[action] = 1
return action, one_hot_action_probabilities
def get_control_param(self):
return 0

View File

@@ -20,8 +20,7 @@ import numpy as np
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ExplorationPolicy
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
@@ -82,26 +81,32 @@ class EGreedy(ExplorationPolicy):
epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
return self.current_random_value >= epsilon
def get_action(self, action_values: List[ActionType]) -> ActionType:
def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
if isinstance(self.action_space, DiscreteActionSpace):
top_action = np.argmax(action_values)
if self.current_random_value < epsilon:
chosen_action = self.action_space.sample()
probabilities = np.full(len(self.action_space.actions),
1. / (self.action_space.high[0] - self.action_space.low[0] + 1))
else:
chosen_action = top_action
chosen_action = np.argmax(action_values)
# one-hot probabilities vector
probabilities = np.zeros(len(self.action_space.actions))
probabilities[chosen_action] = 1
self.step_epsilon()
return chosen_action, probabilities
else:
if self.current_random_value < epsilon and self.phase == RunPhase.TRAIN:
chosen_action = self.action_space.sample()
else:
chosen_action = self.continuous_exploration_policy.get_action(action_values)
# step the epsilon schedule and generate a new random value for next time
if self.phase == RunPhase.TRAIN:
self.epsilon_schedule.step()
self.current_random_value = np.random.rand()
return chosen_action
self.step_epsilon()
return chosen_action
def get_control_param(self):
if isinstance(self.action_space, DiscreteActionSpace):
@@ -113,3 +118,9 @@ class EGreedy(ExplorationPolicy):
super().change_phase(phase)
if isinstance(self.action_space, BoxActionSpace):
self.continuous_exploration_policy.change_phase(phase)
def step_epsilon(self):
# step the epsilon schedule and generate a new random value for next time
if self.phase == RunPhase.TRAIN:
self.epsilon_schedule.step()
self.current_random_value = np.random.rand()

View File

@@ -18,7 +18,7 @@ from typing import List
from rl_coach.base_parameters import Parameters
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.spaces import ActionSpace
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace, GoalsSpace
class ExplorationParameters(Parameters):
@@ -54,14 +54,10 @@ class ExplorationPolicy(object):
Given a list of values corresponding to each action,
choose one action according to the exploration policy
:param action_values: A list of action values
:return: The chosen action
:return: The chosen action,
The probability of the action (if available, otherwise 1 for absolute certainty in the action)
"""
if self.__class__ == ExplorationPolicy:
raise ValueError("The ExplorationPolicy class is an abstract class and should not be used directly. "
"Please set the exploration parameters to point to an inheriting class like EGreedy or "
"AdditiveNoise")
else:
raise ValueError("The get_action function should be overridden in the inheriting exploration class")
raise NotImplementedError()
def change_phase(self, phase):
"""
@@ -82,3 +78,42 @@ class ExplorationPolicy(object):
def get_control_param(self):
return 0
class DiscreteActionExplorationPolicy(ExplorationPolicy):
"""
A discrete action exploration policy.
"""
def __init__(self, action_space: ActionSpace):
"""
:param action_space: the action space used by the environment
"""
assert isinstance(action_space, DiscreteActionSpace)
super().__init__(action_space)
def get_action(self, action_values: List[ActionType]) -> (ActionType, List):
"""
Given a list of values corresponding to each action,
choose one action according to the exploration policy
:param action_values: A list of action values
:return: The chosen action,
The probabilities of actions to select from (if not available a one-hot vector)
"""
if self.__class__ == ExplorationPolicy:
raise ValueError("The ExplorationPolicy class is an abstract class and should not be used directly. "
"Please set the exploration parameters to point to an inheriting class like EGreedy or "
"AdditiveNoise")
else:
raise ValueError("The get_action function should be overridden in the inheriting exploration class")
class ContinuousActionExplorationPolicy(ExplorationPolicy):
"""
A continuous action exploration policy.
"""
def __init__(self, action_space: ActionSpace):
"""
:param action_space: the action space used by the environment
"""
assert isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)
super().__init__(action_space)

View File

@@ -19,7 +19,7 @@ from typing import List
import numpy as np
from rl_coach.core_types import ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ExplorationPolicy
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
@@ -41,9 +41,12 @@ class Greedy(ExplorationPolicy):
"""
super().__init__(action_space)
def get_action(self, action_values: List[ActionType]) -> ActionType:
def get_action(self, action_values: List[ActionType]):
if type(self.action_space) == DiscreteActionSpace:
return np.argmax(action_values)
action = np.argmax(action_values)
one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
one_hot_action_probabilities[action] = 1
return action, one_hot_action_probabilities
if type(self.action_space) == BoxActionSpace:
return action_values

View File

@@ -19,12 +19,13 @@ from typing import List
import numpy as np
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters
from rl_coach.spaces import ActionSpace, BoxActionSpace, GoalsSpace
# Based on the description in:
# https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OUProcessParameters(ExplorationParameters):
def __init__(self):
super().__init__()
@@ -39,7 +40,7 @@ class OUProcessParameters(ExplorationParameters):
# Ornstein-Uhlenbeck process
class OUProcess(ExplorationPolicy):
class OUProcess(ContinuousActionExplorationPolicy):
"""
OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
@@ -56,10 +57,6 @@ class OUProcess(ExplorationPolicy):
self.state = np.zeros(self.action_space.shape)
self.dt = dt
if not (isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)):
raise ValueError("OU process exploration works only for continuous controls."
"The given action space is of type: {}".format(action_space.__class__.__name__))
def reset(self):
self.state = np.zeros(self.action_space.shape)

View File

@@ -59,9 +59,13 @@ class ParameterNoise(ExplorationPolicy):
self.network_params = network_params
self._replace_network_dense_layers()
def get_action(self, action_values: List[ActionType]) -> ActionType:
def get_action(self, action_values: List[ActionType]):
if type(self.action_space) == DiscreteActionSpace:
return np.argmax(action_values)
action = np.argmax(action_values)
one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
one_hot_action_probabilities[action] = 1
return action, one_hot_action_probabilities
elif type(self.action_space) == BoxActionSpace:
action_values_mean = action_values[0].squeeze()
action_values_std = action_values[1].squeeze()

View File

@@ -20,7 +20,7 @@ import numpy as np
from scipy.stats import truncnorm
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ContinuousActionExplorationPolicy
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace, BoxActionSpace
@@ -38,7 +38,7 @@ class TruncatedNormalParameters(ExplorationParameters):
return 'rl_coach.exploration_policies.truncated_normal:TruncatedNormal'
class TruncatedNormal(ExplorationPolicy):
class TruncatedNormal(ContinuousActionExplorationPolicy):
"""
The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a
normal distribution, where the mean action is given by the agent, and the standard deviation can be given in t