mirror of
https://github.com/gryf/coach.git
synced 2026-03-11 12:05:47 +01:00
Create a dataset using an agent (#306)
Generate a dataset using an agent (allows selecting between this and a random dataset)
This commit is contained in:
@@ -17,14 +17,17 @@
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.schedules import Schedule, LinearSchedule
|
||||
from rl_coach.spaces import ActionSpace, BoxActionSpace
|
||||
|
||||
|
||||
# TODO: consider renaming to gaussian sampling
|
||||
|
||||
|
||||
class AdditiveNoiseParameters(ExplorationParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@@ -36,7 +39,7 @@ class AdditiveNoiseParameters(ExplorationParameters):
|
||||
return 'rl_coach.exploration_policies.additive_noise:AdditiveNoise'
|
||||
|
||||
|
||||
class AdditiveNoise(ExplorationPolicy):
|
||||
class AdditiveNoise(ContinuousActionExplorationPolicy):
|
||||
"""
|
||||
AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent
|
||||
and adds a Gaussian distributed noise to it. The amount of noise added to the action follows the noise amount that
|
||||
|
||||
@@ -19,7 +19,7 @@ from typing import List
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import DiscreteActionExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.schedules import Schedule
|
||||
from rl_coach.spaces import ActionSpace
|
||||
|
||||
@@ -34,8 +34,7 @@ class BoltzmannParameters(ExplorationParameters):
|
||||
return 'rl_coach.exploration_policies.boltzmann:Boltzmann'
|
||||
|
||||
|
||||
|
||||
class Boltzmann(ExplorationPolicy):
|
||||
class Boltzmann(DiscreteActionExplorationPolicy):
|
||||
"""
|
||||
The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible
|
||||
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
|
||||
@@ -50,7 +49,7 @@ class Boltzmann(ExplorationPolicy):
|
||||
super().__init__(action_space)
|
||||
self.temperature_schedule = temperature_schedule
|
||||
|
||||
def get_action(self, action_values: List[ActionType]) -> ActionType:
|
||||
def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
|
||||
if self.phase == RunPhase.TRAIN:
|
||||
self.temperature_schedule.step()
|
||||
# softmax calculation
|
||||
@@ -59,7 +58,8 @@ class Boltzmann(ExplorationPolicy):
|
||||
# make sure probs sum to 1
|
||||
probabilities[-1] = 1 - np.sum(probabilities[:-1])
|
||||
# choose actions according to the probabilities
|
||||
return np.random.choice(range(self.action_space.shape), p=probabilities)
|
||||
action = np.random.choice(range(self.action_space.shape), p=probabilities)
|
||||
return action, probabilities
|
||||
|
||||
def get_control_param(self):
    """
    :return: the current value of the temperature schedule
    """
    return self.temperature_schedule.current_value
|
||||
|
||||
@@ -19,7 +19,7 @@ from typing import List
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import DiscreteActionExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.spaces import ActionSpace
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ class CategoricalParameters(ExplorationParameters):
|
||||
return 'rl_coach.exploration_policies.categorical:Categorical'
|
||||
|
||||
|
||||
class Categorical(ExplorationPolicy):
|
||||
class Categorical(DiscreteActionExplorationPolicy):
|
||||
"""
|
||||
Categorical exploration policy is intended for discrete action spaces. It expects the action values to
|
||||
represent a probability distribution over the action, from which a single action will be sampled.
|
||||
@@ -42,13 +42,18 @@ class Categorical(ExplorationPolicy):
|
||||
"""
|
||||
super().__init__(action_space)
|
||||
|
||||
def get_action(self, action_values: List[ActionType]) -> ActionType:
|
||||
def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
|
||||
if self.phase == RunPhase.TRAIN:
|
||||
# choose actions according to the probabilities
|
||||
return np.random.choice(self.action_space.actions, p=action_values)
|
||||
action = np.random.choice(self.action_space.actions, p=action_values)
|
||||
return action, action_values
|
||||
else:
|
||||
# take the action with the highest probability
|
||||
return np.argmax(action_values)
|
||||
action = np.argmax(action_values)
|
||||
one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
|
||||
one_hot_action_probabilities[action] = 1
|
||||
|
||||
return action, one_hot_action_probabilities
|
||||
|
||||
def get_control_param(self):
    """
    :return: always 0
    """
    # NOTE(review): presumably 0 because this policy has no scheduled parameter to report - confirm
    return 0
|
||||
|
||||
@@ -20,8 +20,7 @@ import numpy as np
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ExplorationPolicy
|
||||
from rl_coach.schedules import Schedule, LinearSchedule
|
||||
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
|
||||
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
|
||||
@@ -82,26 +81,32 @@ class EGreedy(ExplorationPolicy):
|
||||
epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
|
||||
return self.current_random_value >= epsilon
|
||||
|
||||
def get_action(self, action_values: List[ActionType]) -> ActionType:
|
||||
def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
|
||||
epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
|
||||
|
||||
if isinstance(self.action_space, DiscreteActionSpace):
|
||||
top_action = np.argmax(action_values)
|
||||
if self.current_random_value < epsilon:
|
||||
chosen_action = self.action_space.sample()
|
||||
probabilities = np.full(len(self.action_space.actions),
|
||||
1. / (self.action_space.high[0] - self.action_space.low[0] + 1))
|
||||
else:
|
||||
chosen_action = top_action
|
||||
chosen_action = np.argmax(action_values)
|
||||
|
||||
# one-hot probabilities vector
|
||||
probabilities = np.zeros(len(self.action_space.actions))
|
||||
probabilities[chosen_action] = 1
|
||||
|
||||
self.step_epsilon()
|
||||
return chosen_action, probabilities
|
||||
|
||||
else:
|
||||
if self.current_random_value < epsilon and self.phase == RunPhase.TRAIN:
|
||||
chosen_action = self.action_space.sample()
|
||||
else:
|
||||
chosen_action = self.continuous_exploration_policy.get_action(action_values)
|
||||
|
||||
# step the epsilon schedule and generate a new random value for next time
|
||||
if self.phase == RunPhase.TRAIN:
|
||||
self.epsilon_schedule.step()
|
||||
self.current_random_value = np.random.rand()
|
||||
return chosen_action
|
||||
self.step_epsilon()
|
||||
return chosen_action
|
||||
|
||||
def get_control_param(self):
|
||||
if isinstance(self.action_space, DiscreteActionSpace):
|
||||
@@ -113,3 +118,9 @@ class EGreedy(ExplorationPolicy):
|
||||
super().change_phase(phase)
|
||||
if isinstance(self.action_space, BoxActionSpace):
|
||||
self.continuous_exploration_policy.change_phase(phase)
|
||||
|
||||
def step_epsilon(self):
    """
    Advance the epsilon schedule when in the TRAIN phase, and draw a new uniform random value into
    self.current_random_value for the next exploration decision.
    """
    # step the epsilon schedule and generate a new random value for next time
    if self.phase == RunPhase.TRAIN:
        self.epsilon_schedule.step()
    # NOTE(review): the source view lost its indentation; the redraw below is assumed to run in every phase
    # (the random value is also compared against the evaluation epsilon in the TEST phase) - confirm
    self.current_random_value = np.random.rand()
|
||||
|
||||
@@ -18,7 +18,7 @@ from typing import List
|
||||
|
||||
from rl_coach.base_parameters import Parameters
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.spaces import ActionSpace
|
||||
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace, GoalsSpace
|
||||
|
||||
|
||||
class ExplorationParameters(Parameters):
|
||||
@@ -54,14 +54,10 @@ class ExplorationPolicy(object):
|
||||
Given a list of values corresponding to each action,
|
||||
choose one action according to the exploration policy
|
||||
:param action_values: A list of action values
|
||||
:return: The chosen action
|
||||
:return: The chosen action,
|
||||
The probability of the action (if available, otherwise 1 for absolute certainty in the action)
|
||||
"""
|
||||
if self.__class__ == ExplorationPolicy:
|
||||
raise ValueError("The ExplorationPolicy class is an abstract class and should not be used directly. "
|
||||
"Please set the exploration parameters to point to an inheriting class like EGreedy or "
|
||||
"AdditiveNoise")
|
||||
else:
|
||||
raise ValueError("The get_action function should be overridden in the inheriting exploration class")
|
||||
raise NotImplementedError()
|
||||
|
||||
def change_phase(self, phase):
|
||||
"""
|
||||
@@ -82,3 +78,42 @@ class ExplorationPolicy(object):
|
||||
|
||||
def get_control_param(self):
    """
    :return: always 0 in this base implementation
    """
    return 0
|
||||
|
||||
|
||||
class DiscreteActionExplorationPolicy(ExplorationPolicy):
    """
    A discrete action exploration policy.

    Base class for exploration policies that work on a DiscreteActionSpace. Inheriting classes must override
    get_action.
    """
    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment (asserted to be a DiscreteActionSpace)
        """
        assert isinstance(action_space, DiscreteActionSpace), \
            "DiscreteActionExplorationPolicy requires a DiscreteActionSpace. " \
            "The given action space is of type: {}".format(action_space.__class__.__name__)
        super().__init__(action_space)

    def get_action(self, action_values: List[ActionType]) -> (ActionType, List):
        """
        Given a list of values corresponding to each action,
        choose one action according to the exploration policy

        :param action_values: A list of action values
        :return: The chosen action,
                 The probabilities of actions to select from (if not available a one-hot vector)
        :raises ValueError: always, since this base implementation must be overridden by inheriting classes
        """
        # Compare against this class and not ExplorationPolicy: an instance that reaches this method is always a
        # DiscreteActionExplorationPolicy (or a subclass of it), so a comparison with the grandparent class could
        # never be true and the "abstract class" message was unreachable.
        if self.__class__ == DiscreteActionExplorationPolicy:
            raise ValueError("The DiscreteActionExplorationPolicy class is an abstract class and should not be used "
                             "directly. Please set the exploration parameters to point to an inheriting class like "
                             "Boltzmann or Categorical")
        else:
            raise ValueError("The get_action function should be overridden in the inheriting exploration class")
|
||||
|
||||
|
||||
class ContinuousActionExplorationPolicy(ExplorationPolicy):
    """
    Base class for exploration policies that act on continuous action spaces.
    """
    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment; asserted to be either a BoxActionSpace
                             or a GoalsSpace
        """
        # a single isinstance call with a tuple accepts either of the supported space types
        assert isinstance(action_space, (BoxActionSpace, GoalsSpace))
        super().__init__(action_space)
|
||||
|
||||
@@ -19,7 +19,7 @@ from typing import List
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.core_types import ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ExplorationPolicy
|
||||
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
|
||||
|
||||
|
||||
@@ -41,9 +41,12 @@ class Greedy(ExplorationPolicy):
|
||||
"""
|
||||
super().__init__(action_space)
|
||||
|
||||
def get_action(self, action_values: List[ActionType]) -> ActionType:
|
||||
def get_action(self, action_values: List[ActionType]):
|
||||
if type(self.action_space) == DiscreteActionSpace:
|
||||
return np.argmax(action_values)
|
||||
action = np.argmax(action_values)
|
||||
one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
|
||||
one_hot_action_probabilities[action] = 1
|
||||
return action, one_hot_action_probabilities
|
||||
if type(self.action_space) == BoxActionSpace:
|
||||
return action_values
|
||||
|
||||
|
||||
@@ -19,12 +19,13 @@ from typing import List
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.spaces import ActionSpace, BoxActionSpace, GoalsSpace
|
||||
|
||||
|
||||
# Based on the description in:
|
||||
# https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
|
||||
|
||||
class OUProcessParameters(ExplorationParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@@ -39,7 +40,7 @@ class OUProcessParameters(ExplorationParameters):
|
||||
|
||||
|
||||
# Ornstein-Uhlenbeck process
|
||||
class OUProcess(ExplorationPolicy):
|
||||
class OUProcess(ContinuousActionExplorationPolicy):
|
||||
"""
|
||||
OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
|
||||
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
|
||||
@@ -56,10 +57,6 @@ class OUProcess(ExplorationPolicy):
|
||||
self.state = np.zeros(self.action_space.shape)
|
||||
self.dt = dt
|
||||
|
||||
if not (isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)):
|
||||
raise ValueError("OU process exploration works only for continuous controls."
|
||||
"The given action space is of type: {}".format(action_space.__class__.__name__))
|
||||
|
||||
def reset(self):
    """
    Reset the process state to an all-zeros array with the shape of the action space.
    """
    self.state = np.zeros(self.action_space.shape)
|
||||
|
||||
|
||||
@@ -59,9 +59,13 @@ class ParameterNoise(ExplorationPolicy):
|
||||
self.network_params = network_params
|
||||
self._replace_network_dense_layers()
|
||||
|
||||
def get_action(self, action_values: List[ActionType]) -> ActionType:
|
||||
def get_action(self, action_values: List[ActionType]):
|
||||
if type(self.action_space) == DiscreteActionSpace:
|
||||
return np.argmax(action_values)
|
||||
action = np.argmax(action_values)
|
||||
one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
|
||||
one_hot_action_probabilities[action] = 1
|
||||
|
||||
return action, one_hot_action_probabilities
|
||||
elif type(self.action_space) == BoxActionSpace:
|
||||
action_values_mean = action_values[0].squeeze()
|
||||
action_values_std = action_values[1].squeeze()
|
||||
|
||||
@@ -20,7 +20,7 @@ import numpy as np
|
||||
from scipy.stats import truncnorm
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionType
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
|
||||
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ContinuousActionExplorationPolicy
|
||||
from rl_coach.schedules import Schedule, LinearSchedule
|
||||
from rl_coach.spaces import ActionSpace, BoxActionSpace
|
||||
|
||||
@@ -38,7 +38,7 @@ class TruncatedNormalParameters(ExplorationParameters):
|
||||
return 'rl_coach.exploration_policies.truncated_normal:TruncatedNormal'
|
||||
|
||||
|
||||
class TruncatedNormal(ExplorationPolicy):
|
||||
class TruncatedNormal(ContinuousActionExplorationPolicy):
|
||||
"""
|
||||
The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a
|
||||
normal distribution, where the mean action is given by the agent, and the standard deviation can be given in t
|
||||
|
||||
Reference in New Issue
Block a user