Create a dataset using an agent (#306)

Generate a dataset using an agent (allowing the user to choose between an agent-generated dataset and a random one)
2026-07-09 02:46:33 +02:00 · 2019-05-28 09:34:49 +03:00
parent 342b7184bc
commit 9e9c4fd332
26 changed files with 351 additions and 111 deletions
@@ -17,14 +17,17 @@
 from typing import List

 import numpy as np
+import scipy.stats

 from rl_coach.core_types import RunPhase, ActionType
-from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
+from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters
 from rl_coach.schedules import Schedule, LinearSchedule
 from rl_coach.spaces import ActionSpace, BoxActionSpace


 # TODO: consider renaming to gaussian sampling
+
+
 class AdditiveNoiseParameters(ExplorationParameters):
    def __init__(self):
        super().__init__()
@@ -36,7 +39,7 @@ class AdditiveNoiseParameters(ExplorationParameters):
        return 'rl_coach.exploration_policies.additive_noise:AdditiveNoise'


-class AdditiveNoise(ExplorationPolicy):
+class AdditiveNoise(ContinuousActionExplorationPolicy):
    """
    AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent
    and adds a Gaussian distributed noise to it. The amount of noise added to the action follows the noise amount that
@@ -19,7 +19,7 @@ from typing import List
 import numpy as np

 from rl_coach.core_types import RunPhase, ActionType
-from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
+from rl_coach.exploration_policies.exploration_policy import DiscreteActionExplorationPolicy, ExplorationParameters
 from rl_coach.schedules import Schedule
 from rl_coach.spaces import ActionSpace

@@ -34,8 +34,7 @@ class BoltzmannParameters(ExplorationParameters):
        return 'rl_coach.exploration_policies.boltzmann:Boltzmann'


-
-class Boltzmann(ExplorationPolicy):
+class Boltzmann(DiscreteActionExplorationPolicy):
    """
    The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible
    actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
@@ -50,7 +49,7 @@ class Boltzmann(ExplorationPolicy):
        super().__init__(action_space)
        self.temperature_schedule = temperature_schedule

-    def get_action(self, action_values: List[ActionType]) -> ActionType:
+    def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
        if self.phase == RunPhase.TRAIN:
            self.temperature_schedule.step()
        # softmax calculation
@@ -59,7 +58,8 @@ class Boltzmann(ExplorationPolicy):
        # make sure probs sum to 1
        probabilities[-1] = 1 - np.sum(probabilities[:-1])
        # choose actions according to the probabilities
-        return np.random.choice(range(self.action_space.shape), p=probabilities)
+        action = np.random.choice(range(self.action_space.shape), p=probabilities)
+        return action, probabilities

    def get_control_param(self):
        return self.temperature_schedule.current_value
@@ -19,7 +19,7 @@ from typing import List
 import numpy as np

 from rl_coach.core_types import RunPhase, ActionType
-from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
+from rl_coach.exploration_policies.exploration_policy import DiscreteActionExplorationPolicy, ExplorationParameters
 from rl_coach.spaces import ActionSpace


@@ -29,7 +29,7 @@ class CategoricalParameters(ExplorationParameters):
        return 'rl_coach.exploration_policies.categorical:Categorical'


-class Categorical(ExplorationPolicy):
+class Categorical(DiscreteActionExplorationPolicy):
    """
    Categorical exploration policy is intended for discrete action spaces. It expects the action values to
    represent a probability distribution over the action, from which a single action will be sampled.
@@ -42,13 +42,18 @@ class Categorical(ExplorationPolicy):
        """
        super().__init__(action_space)

-    def get_action(self, action_values: List[ActionType]) -> ActionType:
+    def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
        if self.phase == RunPhase.TRAIN:
            # choose actions according to the probabilities
-            return np.random.choice(self.action_space.actions, p=action_values)
+            action = np.random.choice(self.action_space.actions, p=action_values)
+            return action, action_values
        else:
            # take the action with the highest probability
-            return np.argmax(action_values)
+            action = np.argmax(action_values)
+            one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
+            one_hot_action_probabilities[action] = 1
+
+            return action, one_hot_action_probabilities

    def get_control_param(self):
        return 0
@@ -20,8 +20,7 @@ import numpy as np

 from rl_coach.core_types import RunPhase, ActionType
 from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
-from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
-from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy
+from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ExplorationPolicy
 from rl_coach.schedules import Schedule, LinearSchedule
 from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
 from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
@@ -82,26 +81,32 @@ class EGreedy(ExplorationPolicy):
        epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
        return self.current_random_value >= epsilon

-    def get_action(self, action_values: List[ActionType]) -> ActionType:
+    def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]):
        epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value

        if isinstance(self.action_space, DiscreteActionSpace):
-            top_action = np.argmax(action_values)
            if self.current_random_value < epsilon:
                chosen_action = self.action_space.sample()
+                probabilities = np.full(len(self.action_space.actions),
+                                      1. / (self.action_space.high[0] - self.action_space.low[0] + 1))
            else:
-                chosen_action = top_action
+                chosen_action = np.argmax(action_values)
+
+                # one-hot probabilities vector
+                probabilities = np.zeros(len(self.action_space.actions))
+                probabilities[chosen_action] = 1
+
+            self.step_epsilon()
+            return chosen_action, probabilities
+
        else:
            if self.current_random_value < epsilon and self.phase == RunPhase.TRAIN:
                chosen_action = self.action_space.sample()
            else:
                chosen_action = self.continuous_exploration_policy.get_action(action_values)

-        # step the epsilon schedule and generate a new random value for next time
-        if self.phase == RunPhase.TRAIN:
-            self.epsilon_schedule.step()
-        self.current_random_value = np.random.rand()
-        return chosen_action
+            self.step_epsilon()
+            return chosen_action

    def get_control_param(self):
        if isinstance(self.action_space, DiscreteActionSpace):
@@ -113,3 +118,9 @@ class EGreedy(ExplorationPolicy):
        super().change_phase(phase)
        if isinstance(self.action_space, BoxActionSpace):
            self.continuous_exploration_policy.change_phase(phase)
+
+    def step_epsilon(self):
+        # step the epsilon schedule and generate a new random value for next time
+        if self.phase == RunPhase.TRAIN:
+            self.epsilon_schedule.step()
+        self.current_random_value = np.random.rand()
@@ -18,7 +18,7 @@ from typing import List

 from rl_coach.base_parameters import Parameters
 from rl_coach.core_types import RunPhase, ActionType
-from rl_coach.spaces import ActionSpace
+from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace, GoalsSpace


 class ExplorationParameters(Parameters):
@@ -54,14 +54,10 @@ class ExplorationPolicy(object):
        Given a list of values corresponding to each action, 
        choose one action according to the exploration policy
        :param action_values: A list of action values
-        :return: The chosen action
+        :return: The chosen action,
+                 The probability of the action (if available, otherwise 1 for absolute certainty in the action)
        """
-        if self.__class__ == ExplorationPolicy:
-            raise ValueError("The ExplorationPolicy class is an abstract class and should not be used directly. "
-                             "Please set the exploration parameters to point to an inheriting class like EGreedy or "
-                             "AdditiveNoise")
-        else:
-            raise ValueError("The get_action function should be overridden in the inheriting exploration class")
+        raise NotImplementedError()

    def change_phase(self, phase):
        """
@@ -82,3 +78,42 @@ class ExplorationPolicy(object):

    def get_control_param(self):
        return 0
+
+
+class DiscreteActionExplorationPolicy(ExplorationPolicy):
+    """
+    A discrete action exploration policy.
+    """
+    def __init__(self, action_space: ActionSpace):
+        """
+        :param action_space: the action space used by the environment
+        """
+        assert isinstance(action_space, DiscreteActionSpace)
+        super().__init__(action_space)
+
+    def get_action(self, action_values: List[ActionType]) -> (ActionType, List):
+        """
+        Given a list of values corresponding to each action,
+        choose one action according to the exploration policy
+        :param action_values: A list of action values
+        :return: The chosen action,
+                 The probabilities of actions to select from (if not available a one-hot vector)
+        """
+        if self.__class__ == ExplorationPolicy:
+            raise ValueError("The ExplorationPolicy class is an abstract class and should not be used directly. "
+                             "Please set the exploration parameters to point to an inheriting class like EGreedy or "
+                             "AdditiveNoise")
+        else:
+            raise ValueError("The get_action function should be overridden in the inheriting exploration class")
+
+
+class ContinuousActionExplorationPolicy(ExplorationPolicy):
+    """
+    A continuous action exploration policy.
+    """
+    def __init__(self, action_space: ActionSpace):
+        """
+        :param action_space: the action space used by the environment
+        """
+        assert isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)
+        super().__init__(action_space)
@@ -19,7 +19,7 @@ from typing import List
 import numpy as np

 from rl_coach.core_types import ActionType
-from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
+from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ExplorationPolicy
 from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace


@@ -41,9 +41,12 @@ class Greedy(ExplorationPolicy):
        """
        super().__init__(action_space)

-    def get_action(self, action_values: List[ActionType]) -> ActionType:
+    def get_action(self, action_values: List[ActionType]):
        if type(self.action_space) == DiscreteActionSpace:
-            return np.argmax(action_values)
+            action = np.argmax(action_values)
+            one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
+            one_hot_action_probabilities[action] = 1
+            return action, one_hot_action_probabilities
        if type(self.action_space) == BoxActionSpace:
            return action_values

@@ -19,12 +19,13 @@ from typing import List
 import numpy as np

 from rl_coach.core_types import RunPhase, ActionType
-from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
+from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters
 from rl_coach.spaces import ActionSpace, BoxActionSpace, GoalsSpace


 # Based on the description in:
 # https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
+
 class OUProcessParameters(ExplorationParameters):
    def __init__(self):
        super().__init__()
@@ -39,7 +40,7 @@ class OUProcessParameters(ExplorationParameters):


 # Ornstein-Uhlenbeck process
-class OUProcess(ExplorationPolicy):
+class OUProcess(ContinuousActionExplorationPolicy):
    """
    OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
    an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
@@ -56,10 +57,6 @@ class OUProcess(ExplorationPolicy):
        self.state = np.zeros(self.action_space.shape)
        self.dt = dt

-        if not (isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)):
-            raise ValueError("OU process exploration works only for continuous controls."
-                             "The given action space is of type: {}".format(action_space.__class__.__name__))
-
    def reset(self):
        self.state = np.zeros(self.action_space.shape)

@@ -59,9 +59,13 @@ class ParameterNoise(ExplorationPolicy):
        self.network_params = network_params
        self._replace_network_dense_layers()

-    def get_action(self, action_values: List[ActionType]) -> ActionType:
+    def get_action(self, action_values: List[ActionType]):
        if type(self.action_space) == DiscreteActionSpace:
-            return np.argmax(action_values)
+            action = np.argmax(action_values)
+            one_hot_action_probabilities = np.zeros(len(self.action_space.actions))
+            one_hot_action_probabilities[action] = 1
+
+            return action, one_hot_action_probabilities
        elif type(self.action_space) == BoxActionSpace:
            action_values_mean = action_values[0].squeeze()
            action_values_std = action_values[1].squeeze()
@@ -20,7 +20,7 @@ import numpy as np
 from scipy.stats import truncnorm

 from rl_coach.core_types import RunPhase, ActionType
-from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
+from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ContinuousActionExplorationPolicy
 from rl_coach.schedules import Schedule, LinearSchedule
 from rl_coach.spaces import ActionSpace, BoxActionSpace

@@ -38,7 +38,7 @@ class TruncatedNormalParameters(ExplorationParameters):
        return 'rl_coach.exploration_policies.truncated_normal:TruncatedNormal'


-class TruncatedNormal(ExplorationPolicy):
+class TruncatedNormal(ContinuousActionExplorationPolicy):
    """
    The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a
    normal distribution, where the mean action is given by the agent, and the standard deviation can be given in t