mirror of https://github.com/gryf/coach.git (synced 2025-12-17 19:20:19 +01:00)

pre-release 0.10.0

rl_coach/level_manager.py (new file, 258 lines)
@@ -0,0 +1,258 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from typing import Union, Dict, Tuple, Type

from rl_coach.environments.environment import Environment
from rl_coach.environments.environment_interface import EnvironmentInterface
from rl_coach.spaces import ActionSpace, SpacesDefinition

from rl_coach.agents.composite_agent import CompositeAgent
from rl_coach.core_types import EnvResponse, ActionInfo, RunPhase, ActionType, EnvironmentSteps


class LevelManager(EnvironmentInterface):
    """
    The LevelManager is in charge of managing a level in the hierarchy of control. Each level can have one or more
    CompositeAgents and an environment to control. Its API is two-fold:
    1. Expose the services of a LevelManager, such as training the level or stepping it (while behaving according
       to a LevelBehaviorScheme, e.g. as SelfPlay between two identical agents). These methods are implemented in
       the LevelManagerLogic class.
    2. Appear to the level above it as an environment, so that the upper level of control believes it is
       interacting with a regular environment. This includes stepping through what appears to be a regular
       environment, setting its phase or resetting it. These methods are implemented directly in LevelManager,
       as it inherits from EnvironmentInterface.
    """
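
    # A minimal usage sketch (assumption: `agent` and `env` stand for a pre-built Agent/CompositeAgent and
    # Environment; neither is defined in this file). The graph manager drives a LevelManager both as a trainable
    # level and as an environment for the level above it:
    #
    #     manager = LevelManager(name='level_0', agents=agent, environment=env,
    #                            steps_limit=EnvironmentSteps(1))
    #     manager.phase = RunPhase.TRAIN      # propagated to every agent in this level
    #     response = manager.step(None)       # steps the level as if it were an Environment
    #     manager.train()                     # makes a training step for all agents in the level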
    def __init__(self,
                 name: str,
                 agents: Union['Agent', CompositeAgent, Dict[str, Union['Agent', CompositeAgent]]],
                 environment: Union['LevelManager', Environment],
                 real_environment: Environment=None,
                 steps_limit: EnvironmentSteps=EnvironmentSteps(1),
                 should_reset_agent_state_after_time_limit_passes: bool=False
                 ):
        """
        A level manager controls one or more composite agents and a single environment.
        The environment can be either a real environment or another level manager behaving as an environment.
        :param agents: a list of agents or composite agents to control
        :param environment: an environment or level manager to control
        :param real_environment: the real environment that is acted upon. If this is None (which it should be for
        the bottom-most level), it will be replaced by the environment parameter. For simple RL schemes, where
        there is only a single level of hierarchy, this removes the requirement of defining both the environment
        and the real environment, as they are the same.
        :param steps_limit: the number of time steps to run when stepping the internal components
        :param should_reset_agent_state_after_time_limit_passes: reset the agent after stepping for steps_limit
        :param name: the level's name
        """
        super().__init__()

        if not isinstance(agents, dict):
            # insert the single agent or composite agent into a dictionary for compatibility
            agents = {agents.name: agents}
        if real_environment is None:
            self._real_environment = real_environment = environment
        self.agents = agents
        self.environment = environment
        self.real_environment = real_environment
        self.steps_limit = steps_limit
        self.should_reset_agent_state_after_time_limit_passes = should_reset_agent_state_after_time_limit_passes
        self.full_name_id = self.name = name
        self._phase = RunPhase.HEATUP
        self.level_was_reset = True

        # set self as the parent for all the composite agents
        for agent in self.agents.values():
            agent.parent = self
            agent.parent_level_manager = self

        # create all agents in all composite_agents - we do it here so agents will have access to their level manager
        for agent in self.agents.values():
            if isinstance(agent, CompositeAgent):
                agent.create_agents()

        if not isinstance(self.steps_limit, EnvironmentSteps):
            raise ValueError("The num consecutive steps for acting must be defined in terms of environment steps")
        self.build()

        self.last_env_response = self.real_environment.last_env_response
        self.parent_graph_manager = None
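
    # Note on the fallback above: for a single-level scheme it is enough to pass only `environment`; e.g. (with a
    # hypothetical `agent` and `env`) LevelManager(name='main', agents=agent, environment=env) ends up with
    # self.environment and self.real_environment referring to the same object.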

    def handle_episode_ended(self) -> None:
        """
        End the environment episode
        :return: None
        """
        [agent.handle_episode_ended() for agent in self.agents.values()]

    def reset_internal_state(self, force_environment_reset: bool = False) -> EnvResponse:
        """
        Reset the environment episode parameters
        :param force_environment_reset: in some cases, resetting the environment can be suppressed by the
        environment itself. This flag forces the reset.
        :return: the environment response as returned in get_last_env_response
        """
        [agent.reset_internal_state() for agent in self.agents.values()]
        self.level_was_reset = True
        if self.real_environment.current_episode_steps_counter == 0:
            self.last_env_response = self.real_environment.last_env_response
        return self.last_env_response

    @property
    def action_space(self) -> Dict[str, ActionSpace]:
        """
        Get the action space of each of the agents wrapped in this environment.
        :return: the action space
        """
        cagents_dict = self.agents
        cagents_names = cagents_dict.keys()

        return {name: cagents_dict[name].in_action_space for name in cagents_names}

    def get_random_action(self) -> Dict[str, ActionType]:
        """
        Get a random action from the environment action space
        :return: An action that follows the definition of the action space.
        """
        action_spaces = self.action_space  # the action spaces of the abstracted composite agents in this level
        return {name: action_space.sample() for name, action_space in action_spaces.items()}

    def get_random_action_with_info(self) -> Dict[str, ActionInfo]:
        """
        Get a random action from the environment action space and wrap it with additional info
        :return: An action that follows the definition of the action space with additional generated info.
        """
        return {k: ActionInfo(v) for k, v in self.get_random_action().items()}
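
    # Shape sketch for the three accessors above, assuming a single composite agent named 'agent_0'
    # (the name is hypothetical):
    #     self.action_space                  -> {'agent_0': <ActionSpace>}
    #     self.get_random_action()           -> {'agent_0': <sampled action>}
    #     self.get_random_action_with_info() -> {'agent_0': ActionInfo(<sampled action>)}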

    def build(self) -> None:
        """
        Build all the internal components of the level manager (composite agents and environment).
        :return: None
        """
        # TODO: move the spaces definition class to the environment?
        action_space = self.environment.action_space
        if isinstance(action_space, dict):  # TODO: shouldn't be a dict
            action_space = list(action_space.values())[0]
        spaces = SpacesDefinition(state=self.real_environment.state_space,
                                  goal=self.real_environment.goal_space,  # in HRL the agent might want to override this
                                  action=action_space,
                                  reward=self.real_environment.reward_space)
        [agent.set_environment_parameters(spaces) for agent in self.agents.values()]

    def setup_logger(self) -> None:
        """
        Set up the logger for all the agents in the level
        :return: None
        """
        [agent.setup_logger() for agent in self.agents.values()]

    def set_session(self, sess) -> None:
        """
        Set the deep learning framework session for all the composite agents in the level manager
        :return: None
        """
        [agent.set_session(sess) for agent in self.agents.values()]

    def train(self) -> None:
        """
        Make a training step for all the composite agents in this level manager
        :return: None
        """
        [agent.train() for agent in self.agents.values()]

    @property
    def phase(self) -> RunPhase:
        """
        Get the phase of the level manager
        :return: the current phase
        """
        return self._phase

    @phase.setter
    def phase(self, val: RunPhase):
        """
        Change the phase of the level manager and all the hierarchy levels below it
        :param val: the new phase
        :return: None
        """
        self._phase = val
        for agent in self.agents.values():
            agent.phase = val
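
    # Propagation sketch: each agent's `phase` setter is expected to forward the value to whatever it controls
    # (an assumption about Agent/CompositeAgent, not shown in this file), so assigning the phase once at the top
    # cascades down the hierarchy:
    #
    #     top_level_manager.phase = RunPhase.TEST   # `top_level_manager` is hypothetical
    #     # -> every agent in this level, and through them every lower level, switches to TEST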

    def step(self, action: Union[None, Dict[str, ActionType]]) -> EnvResponse:
        """
        Run a single step, following the behavioral scheme set for this environment.
        :param action: the action to apply to the agents held in this level, before beginning to follow
        the scheme.
        :return: the environment response that is exposed to the level above
        """
        # set the incoming directive for the sub-agent (goal / skill selection / etc.)
        if action is not None:
            for agent_name, agent in self.agents.items():
                agent.set_incoming_directive(action)

        # get the last response or the initial response from the environment
        env_response = copy.copy(self.environment.last_env_response)

        # step for several time steps
        accumulated_reward = 0
        acting_agent = list(self.agents.values())[0]

        for i in range(self.steps_limit.num_steps):
            # let the agent observe the result and decide if it wants to terminate the episode
            done = acting_agent.observe(env_response)

            if done:
                break
            else:
                # get action
                action_info = acting_agent.act()

                # step environment
                env_response = self.environment.step(action_info.action)

                # accumulate rewards such that the master policy will see the total reward during the step phase
                accumulated_reward += env_response.reward

        # update the env response that will be exposed to the parent agent
        env_response_for_upper_level = copy.copy(env_response)
        env_response_for_upper_level.reward = accumulated_reward
        self.last_env_response = env_response_for_upper_level

        # if the environment terminated the episode -> let the agent observe the last response
        # in HRL, excluding the top level, we will always enter the if clause below
        # (because should_reset_agent_state_after_time_limit_passes is set to True)
        if env_response.game_over or self.should_reset_agent_state_after_time_limit_passes:
            # this is the agent's only opportunity to observe this transition - it will not get another one
            acting_agent.observe(env_response)  # TODO: acting agent? maybe all of the agents in the layer?
            self.handle_episode_ended()
            self.reset_internal_state()

        return env_response_for_upper_level
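
    # Walkthrough sketch of a single call from the level above (names are hypothetical):
    #
    #     goal = {'agent_0': some_goal}            # directive chosen by the upper-level agent
    #     response = level_manager.step(goal)      # runs up to steps_limit inner environment steps
    #     response.reward                          # sum of the inner rewards accumulated above
    #
    # so the parent sees one "macro" transition per call, while this level runs its own agent/environment loop.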

    def save_checkpoint(self, checkpoint_id: int) -> None:
        """
        Save checkpoints of the networks of all agents
        :param checkpoint_id: the id of the checkpoint to save
        :return: None
        """
        [agent.save_checkpoint(checkpoint_id) for agent in self.agents.values()]

    def sync(self) -> None:
        """
        Sync the networks of the agents with the global network parameters
        :return: None
        """
        [agent.sync() for agent in self.agents.values()]
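

# Stacking sketch (module-level comment; `low_level_env`, `low_level_agent` and `high_level_agent` are
# hypothetical and would normally be wired up by the graph manager rather than by hand):
#
#     bottom = LevelManager(name='level_0', agents=low_level_agent, environment=low_level_env,
#                           steps_limit=EnvironmentSteps(40),
#                           should_reset_agent_state_after_time_limit_passes=True)
#     top = LevelManager(name='level_1', agents=high_level_agent, environment=bottom,
#                        real_environment=low_level_env)
#
# `top` treats `bottom` as its environment (the Union['LevelManager', Environment] type hint above), while both
# levels share the same real environment for their state/goal/reward space definitions.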