Create a dataset using an agent (#306)

Generate a dataset using an agent (with the option to choose between this and a random dataset)
2026-02-14 12:55:51 +01:00 · 2019-05-28 09:34:49 +03:00
parent 342b7184bc
commit 9e9c4fd332
26 changed files with 351 additions and 111 deletions
--- a/rl_coach/agents/agent.py
+++ b/rl_coach/agents/agent.py
@@ -685,7 +685,10 @@ class Agent(AgentInterface):
        """
        loss = 0
        if self._should_train():
-            self.training_epoch += 1
+            if self.ap.is_batch_rl_training:
+                # count training epochs only during batch-rl training. training an agent just to generate a
+                # dataset should not be counted, and training epochs only matter in batch-rl anyway.
+                self.training_epoch += 1
            for network in self.networks.values():
                network.set_is_training(True)

@@ -1047,3 +1050,11 @@ class Agent(AgentInterface):
                TimeTypes.EnvironmentSteps: self.total_steps_counter,
                TimeTypes.WallClockTime: self.agent_logger.get_current_wall_clock_time(),
                TimeTypes.Epoch: self.training_epoch}[self.parent_level_manager.parent_graph_manager.time_metric]
+
+    def freeze_memory(self):
+        """
+        Shuffle episodes in the memory and freeze it to make sure that no extra data is being pushed anymore.
+        :return: None
+        """
+        self.call_memory('shuffle_episodes')
+        self.call_memory('freeze')
--- a/rl_coach/agents/categorical_dqn_agent.py
+++ b/rl_coach/agents/categorical_dqn_agent.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
 from typing import Union

 import numpy as np
@@ -83,13 +82,22 @@ class CategoricalDQNAgent(ValueOptimizationAgent):

    # prediction's format is (batch,actions,atoms)
    def get_all_q_values_for_states(self, states: StateType):
+        q_values = None
        if self.exploration_policy.requires_action_values():
            q_values = self.get_prediction(states,
                                           outputs=[self.networks['main'].online_network.output_heads[0].q_values])
-        else:
-            q_values = None
+
        return q_values

+    def get_all_q_values_for_states_and_softmax_probabilities(self, states: StateType):
+        actions_q_values, softmax_probabilities = None, None
+        if self.exploration_policy.requires_action_values():
+            outputs = [self.networks['main'].online_network.output_heads[0].q_values,
+                       self.networks['main'].online_network.output_heads[0].softmax]
+            actions_q_values, softmax_probabilities = self.get_prediction(states, outputs=outputs)
+
+        return actions_q_values, softmax_probabilities
+
    def learn_from_batch(self, batch):
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

--- a/rl_coach/agents/dfp_agent.py
+++ b/rl_coach/agents/dfp_agent.py
@@ -182,7 +182,7 @@ class DFPAgent(Agent):
            action_values = None

        # choose action according to the exploration policy and the current phase (evaluating or training the agent)
-        action = self.exploration_policy.get_action(action_values)
+        action, _ = self.exploration_policy.get_action(action_values)

        if action_values is not None:
            action_values = action_values.squeeze()
--- a/rl_coach/agents/dqn_agent.py
+++ b/rl_coach/agents/dqn_agent.py
@@ -49,6 +49,7 @@ class DQNNetworkParameters(NetworkParameters):
        self.batch_size = 32
        self.replace_mse_with_huber_loss = True
        self.create_target_network = True
+        self.should_get_softmax_probabilities = False


 class DQNAgentParameters(AgentParameters):
--- a/rl_coach/agents/nec_agent.py
+++ b/rl_coach/agents/nec_agent.py
@@ -16,7 +16,7 @@

 import os
 import pickle
-from typing import Union
+from typing import Union, List

 import numpy as np

@@ -40,6 +40,7 @@ class NECNetworkParameters(NetworkParameters):
        self.middleware_parameters = FCMiddlewareParameters()
        self.heads_parameters = [DNDQHeadParameters()]
        self.optimizer_type = 'Adam'
+        self.should_get_softmax_probabilities = False


 class NECAlgorithmParameters(AlgorithmParameters):
@@ -166,11 +167,25 @@ class NECAgent(ValueOptimizationAgent):

        return super().act()

-    def get_all_q_values_for_states(self, states: StateType):
+    def get_all_q_values_for_states(self, states: StateType, additional_outputs: List = None):
        # we need to store the state embeddings regardless if the action is random or not
-        return self.get_prediction(states)
+        return self.get_prediction_and_update_embeddings(states)

-    def get_prediction(self, states):
+    def get_all_q_values_for_states_and_softmax_probabilities(self, states: StateType):
+        # get the actions q values and the state embedding
+        embedding, actions_q_values, softmax_probabilities = self.networks['main'].online_network.predict(
+            self.prepare_batch_for_inference(states, 'main'),
+            outputs=[self.networks['main'].online_network.state_embedding,
+                     self.networks['main'].online_network.output_heads[0].output,
+                     self.networks['main'].online_network.output_heads[0].softmax]
+        )
+        if self.phase != RunPhase.TEST:
+            # store the state embedding for inserting it to the DND later
+            self.current_episode_state_embeddings.append(embedding.squeeze())
+        actions_q_values = actions_q_values[0][0]
+        return actions_q_values, softmax_probabilities
+
+    def get_prediction_and_update_embeddings(self, states):
        # get the actions q values and the state embedding
        embedding, actions_q_values = self.networks['main'].online_network.predict(
            self.prepare_batch_for_inference(states, 'main'),
--- a/rl_coach/agents/policy_optimization_agent.py
+++ b/rl_coach/agents/policy_optimization_agent.py
@@ -147,7 +147,7 @@ class PolicyOptimizationAgent(Agent):
        if isinstance(self.spaces.action, DiscreteActionSpace):
            # DISCRETE
            action_probabilities = np.array(action_values).squeeze()
-            action = self.exploration_policy.get_action(action_probabilities)
+            action, _ = self.exploration_policy.get_action(action_probabilities)
            action_info = ActionInfo(action=action,
                                     all_action_probabilities=action_probabilities)

--- a/rl_coach/agents/qr_dqn_agent.py
+++ b/rl_coach/agents/qr_dqn_agent.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+from copy import copy
 from typing import Union

 import numpy as np
@@ -79,6 +79,17 @@ class QuantileRegressionDQNAgent(ValueOptimizationAgent):
            actions_q_values = None
        return actions_q_values

+    # prediction's format is (batch,actions,atoms)
+    def get_all_q_values_for_states_and_softmax_probabilities(self, states: StateType):
+        actions_q_values, softmax_probabilities = None, None
+        if self.exploration_policy.requires_action_values():
+            outputs = copy(self.networks['main'].online_network.outputs)
+            outputs.append(self.networks['main'].online_network.output_heads[0].softmax)
+            quantile_values, softmax_probabilities = self.get_prediction(states, outputs)
+            actions_q_values = self.get_q_values(quantile_values)
+
+        return actions_q_values, softmax_probabilities
+
    def learn_from_batch(self, batch):
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

--- a/rl_coach/agents/value_optimization_agent.py
+++ b/rl_coach/agents/value_optimization_agent.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 from collections import OrderedDict
-from typing import Union
+from typing import Union, List

 import numpy as np

@@ -24,7 +24,8 @@ from rl_coach.filters.filter import NoInputFilter
 from rl_coach.logger import screen
 from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplay
 from rl_coach.spaces import DiscreteActionSpace
-from copy import deepcopy
+from copy import deepcopy, copy
+

 ## This is an abstract agent - there is no learn_from_batch method ##

@@ -35,6 +36,12 @@ class ValueOptimizationAgent(Agent):
        self.q_values = self.register_signal("Q")
        self.q_value_for_action = {}

+        # currently we use softmax action probabilities only in batch-rl,
+        # but we might want to extend this later at some point.
+        self.should_get_softmax_probabilities = \
+            hasattr(self.ap.network_wrappers['main'], 'should_get_softmax_probabilities') and  \
+            self.ap.network_wrappers['main'].should_get_softmax_probabilities
+
    def init_environment_dependent_modules(self):
        super().init_environment_dependent_modules()
        if isinstance(self.spaces.action, DiscreteActionSpace):
@@ -45,12 +52,21 @@ class ValueOptimizationAgent(Agent):

    # Algorithms for which q_values are calculated from predictions will override this function
    def get_all_q_values_for_states(self, states: StateType):
+        actions_q_values = None
        if self.exploration_policy.requires_action_values():
            actions_q_values = self.get_prediction(states)
-        else:
-            actions_q_values = None
+
        return actions_q_values

+    def get_all_q_values_for_states_and_softmax_probabilities(self, states: StateType):
+        actions_q_values, softmax_probabilities = None, None
+        if self.exploration_policy.requires_action_values():
+            outputs = copy(self.networks['main'].online_network.outputs)
+            outputs.append(self.networks['main'].online_network.output_heads[0].softmax)
+
+            actions_q_values, softmax_probabilities = self.get_prediction(states, outputs=outputs)
+        return actions_q_values, softmax_probabilities
+
    def get_prediction(self, states, outputs=None):
        return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'),
                                                            outputs=outputs)
@@ -72,10 +88,19 @@ class ValueOptimizationAgent(Agent):
            ).format(policy.__class__.__name__))

    def choose_action(self, curr_state):
-        actions_q_values = self.get_all_q_values_for_states(curr_state)
+        if self.should_get_softmax_probabilities:
+            actions_q_values, softmax_probabilities = \
+                self.get_all_q_values_for_states_and_softmax_probabilities(curr_state)
+        else:
+            actions_q_values = self.get_all_q_values_for_states(curr_state)

        # choose action according to the exploration policy and the current phase (evaluating or training the agent)
-        action = self.exploration_policy.get_action(actions_q_values)
+        action, action_probabilities = self.exploration_policy.get_action(actions_q_values)
+        if self.should_get_softmax_probabilities and softmax_probabilities is not None:
+            # replace the probabilities generated by the exploration policy with the agent's
+            # actual policy (the softmax probabilities) whenever those are available
+            action_probabilities = softmax_probabilities
+
        self._validate_action(self.exploration_policy, action)

        if actions_q_values is not None:
@@ -87,15 +112,18 @@ class ValueOptimizationAgent(Agent):
            self.q_values.add_sample(actions_q_values)

            actions_q_values = actions_q_values.squeeze()
+            action_probabilities = action_probabilities.squeeze()

            for i, q_value in enumerate(actions_q_values):
                self.q_value_for_action[i].add_sample(q_value)

            action_info = ActionInfo(action=action,
                                     action_value=actions_q_values[action],
-                                     max_action_value=np.max(actions_q_values))
+                                     max_action_value=np.max(actions_q_values),
+                                     all_action_probabilities=action_probabilities)
+
        else:
-            action_info = ActionInfo(action=action)
+            action_info = ActionInfo(action=action, all_action_probabilities=action_probabilities)

        return action_info