pre-release 0.10.0

2026-07-09 10:56:33 +02:00 · 2018-08-13 17:11:34 +03:00
parent d44c329bb8
commit 19ca5c24b1
485 changed files with 33292 additions and 16770 deletions
@@ -0,0 +1,318 @@
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import List, Tuple, Union, Dict, Any
+
+import numpy as np
+from rl_coach.utils import ReaderWriterLock
+
+from rl_coach.core_types import Transition, Episode
+from rl_coach.memories.memory import Memory, MemoryGranularity, MemoryParameters
+
+
+class EpisodicExperienceReplayParameters(MemoryParameters):
+    """
+    Parameters for the EpisodicExperienceReplay memory.
+    """
+    def __init__(self):
+        super().__init__()
+        # default capacity: one million entries, counted in transitions
+        default_granularity = MemoryGranularity.Transitions
+        self.max_size = (default_granularity, 1000000)
+
+    @property
+    def path(self):
+        """
+        :return: the path string naming the memory class that these parameters describe
+        """
+        memory_class_path = 'rl_coach.memories.episodic.episodic_experience_replay:EpisodicExperienceReplay'
+        return memory_class_path
+
+
+class EpisodicExperienceReplay(Memory):
+    """
+    A replay buffer that stores episodes of transitions. The additional structure allows performing various
+    calculations of total return and other values that depend on the sequential behavior of the transitions
+    in the episode.
+    """
+    def __init__(self, max_size: Tuple[MemoryGranularity, int]):
+        """
+        :param max_size: a (granularity, amount) tuple with the maximum number of transitions or episodes
+                         to hold in the memory
+        """
+        super().__init__(max_size)
+
+        # taken by the methods below around every access to the stored episodes and transitions
+        self.reader_writer_lock = ReaderWriterLock()
+
+        # the episodic replay buffer starts with a single empty episode, hence a length of 1
+        self._buffer = [Episode()]  # list of episodes
+        self._length = 1
+
+        # flat list of all the stored transitions, and the counters that go with it
+        self.transitions = []
+        self._num_transitions = 0
+        self._num_transitions_in_complete_episodes = 0
+
+    def length(self, lock: bool=False) -> int:
+        """
+        Get the number of episodes in the ER (even if they are not complete)
+        :param lock: not used by this method. kept so that callers which pass it keep working
+        :return: the number of episodes, not counting a last episode which is still empty
+        """
+        length = self._length
+        # compare by value: `is not 0` tests object identity, which only happens to work for small ints
+        # and is reported as a SyntaxWarning by newer Python versions
+        if length != 0 and self._buffer and self._buffer[-1].is_empty():
+            length -= 1
+
+        return length
+
+    def num_complete_episodes(self):
+        """
+        Get the number of complete episodes in ER
+        :return: the episodes counter minus one, as the last episode in the buffer is the one still being filled
+        """
+        return self._length - 1
+
+    def num_transitions(self):
+        """
+        Get the number of transitions stored in the memory, including those of an episode which is not closed yet
+        :return: the value of the transitions counter
+        """
+        return self._num_transitions
+
+    def num_transitions_in_complete_episodes(self):
+        """
+        Get the number of transitions which belong to episodes that were already closed
+        :return: the value of the counter which is advanced every time an episode is closed
+        """
+        return self._num_transitions_in_complete_episodes
+
+    def sample(self, size: int) -> List[Transition]:
+        """
+        Sample a batch of transitions from the replay buffer. The indices are drawn independently (so a transition
+        can appear more than once) out of the first num_transitions_in_complete_episodes() stored transitions.
+        :param size: the size of the batch to sample
+        :return: a batch (list) of selected transitions from the replay buffer
+        :raises ValueError: if there are no complete episodes in the replay buffer yet
+        """
+        self.reader_writer_lock.lock_writing()
+        try:
+            if self.num_complete_episodes() < 1:
+                raise ValueError("The episodic replay buffer cannot be sampled since there are no complete episodes yet. "
+                                 "There is currently 1 episodes with {} transitions".format(self._buffer[0].length()))
+
+            transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=size)
+            return [self.transitions[i] for i in transitions_idx]
+        finally:
+            # release on the error path as well - otherwise a failed sample leaves the lock taken forever
+            self.reader_writer_lock.release_writing()
+
+    def _enforce_max_length(self) -> None:
+        """
+        Keep the replay buffer within the maximum size given by self.max_size, by removing the oldest episode
+        (the one at index 0) again and again for as long as the limit is exceeded.
+        :return: None
+        """
+        granularity, limit = self.max_size
+
+        if granularity == MemoryGranularity.Transitions:
+            # a limit of 0 transitions means that nothing is ever removed
+            if limit == 0:
+                return
+            while self.num_transitions() > limit:
+                self._remove_episode(0)
+            return
+
+        if granularity == MemoryGranularity.Episodes:
+            while self.length() > limit:
+                self._remove_episode(0)
+
+    def _update_episode(self, episode: Episode) -> None:
+        """
+        Called by close_last_episode on the episode that is being closed. Here it only asks the episode to
+        update its returns.
+        :param episode: the episode that was just closed
+        :return: None
+        """
+        episode.update_returns()
+
+    def verify_last_episode_is_closed(self) -> None:
+        """
+        Verify that there is no open episodes in the replay buffer. If the last episode holds any transitions,
+        it is closed.
+        :return: None
+        """
+        self.reader_writer_lock.lock_writing_and_reading()
+        try:
+            # the lock is already held, so the nested calls are told not to take it again
+            last_episode = self.get(-1, False)
+            if last_episode and last_episode.length() > 0:
+                self.close_last_episode(lock=False)
+        finally:
+            # release even if closing the episode raised
+            self.reader_writer_lock.release_writing_and_reading()
+
+    def close_last_episode(self, lock=True) -> None:
+        """
+        Close the last episode in the replay buffer and open a new one
+        :param lock: if True, the lock is taken for the duration of the call. pass False when the caller
+                     already holds it
+        :return: None
+        """
+        if lock:
+            self.reader_writer_lock.lock_writing_and_reading()
+        try:
+            last_episode = self._buffer[-1]
+
+            self._num_transitions_in_complete_episodes += last_episode.length()
+            self._length += 1
+
+            # create a new Episode for the next transitions to be placed into
+            self._buffer.append(Episode())
+
+            # if update episode adds to the buffer, a new Episode needs to be ready first
+            # it would be better if this were less stateful
+            self._update_episode(last_episode)
+
+            self._enforce_max_length()
+        finally:
+            # release even if updating the episode or enforcing the size limit raised
+            if lock:
+                self.reader_writer_lock.release_writing_and_reading()
+
+    def store(self, transition: Transition) -> None:
+        """
+        Store a new transition in the memory. If the transition game_over flag is on, this closes the episode and
+        creates a new empty episode.
+        Warning! using the episodic memory by storing individual transitions instead of episodes will use the default
+        Episode class parameters in order to create new episodes.
+        :param transition: a transition to store
+        :return: None
+        """
+        self.reader_writer_lock.lock_writing_and_reading()
+        try:
+            if len(self._buffer) == 0:
+                # all the episodes were removed - open a new one, and bring the episodes counter back in
+                # step with the buffer so that length() and num_complete_episodes() stay correct
+                self._buffer.append(Episode())
+                self._length = len(self._buffer)
+            last_episode = self._buffer[-1]
+            last_episode.insert(transition)
+            self.transitions.append(transition)
+            self._num_transitions += 1
+            if transition.game_over:
+                self.close_last_episode(False)
+
+            self._enforce_max_length()
+        finally:
+            # release even if inserting or closing raised
+            self.reader_writer_lock.release_writing_and_reading()
+
+    def store_episode(self, episode: Episode, lock: bool=True) -> None:
+        """
+        Store a new episode in the memory. The episode is stored as a complete episode, and a new empty
+        episode is opened after it. If an episode is still in progress (it was being filled through store()),
+        it is closed first.
+        :param episode: the new episode to store
+        :param lock: if True, the lock is taken for the duration of the call. pass False when the caller
+                     already holds it
+        :return: None
+        """
+        if lock:
+            self.reader_writer_lock.lock_writing_and_reading()
+        try:
+            if len(self._buffer) == 0:
+                # all the episodes were removed - open the slot that the new episode will take
+                self._buffer.append(Episode())
+                self._length = len(self._buffer)
+            elif self._buffer[-1].length() != 0:
+                # an episode is still in progress. appending after it would leave it unclosed in the middle
+                # of the buffer, with the episodes counter and the complete transitions counter no longer
+                # matching the buffer. close it, which also opens the empty slot used below
+                self.close_last_episode(False)
+
+            self._buffer[-1] = episode
+            self.transitions.extend(episode.transitions)
+            self._num_transitions += episode.length()
+            self.close_last_episode(False)
+        finally:
+            # release even if closing the episode raised
+            if lock:
+                self.reader_writer_lock.release_writing_and_reading()
+
+    def get_episode(self, episode_index: int, lock: bool=True) -> Union[None, Episode]:
+        """
+        Returns the episode in the given index. If the episode does not exist, returns None instead.
+        :param episode_index: the index of the episode to return. negative values index from the end of the buffer
+        :param lock: if True, the lock is taken for the duration of the call. pass False when the caller
+                     already holds it
+        :return: the corresponding episode
+        """
+        if lock:
+            self.reader_writer_lock.lock_writing()
+        try:
+            num_episodes = self.length()
+            if num_episodes == 0 or episode_index >= num_episodes:
+                return None
+            # a negative index that reaches before the first episode does not exist either
+            if episode_index < -len(self._buffer):
+                return None
+            return self._buffer[episode_index]
+        finally:
+            if lock:
+                self.reader_writer_lock.release_writing()
+
+    def _remove_episode(self, episode_index: int) -> None:
+        """
+        Remove the episode in the given index (even if it is not complete yet). An index that does not exist
+        is ignored. The caller is expected to hold the lock.
+        :param episode_index: the index of the episode to remove. negative values index from the end of the buffer
+        :return: None
+        """
+        num_episodes = len(self._buffer)
+        if episode_index < 0:
+            episode_index += num_episodes
+        if not 0 <= episode_index < num_episodes:
+            return
+
+        episode_length = self._buffer[episode_index].length()
+
+        # self.transitions holds the transitions of the episodes back to back, in buffer order, so the
+        # transitions of this episode start after those of all the episodes before it (not always at 0)
+        first = sum(earlier_episode.length() for earlier_episode in self._buffer[:episode_index])
+        del self.transitions[first:first + episode_length]
+        self._num_transitions -= episode_length
+
+        if episode_index == num_episodes - 1:
+            # the last episode is the one still being filled: its transitions were never counted as
+            # complete, and the buffer must keep an open episode, so it is replaced by an empty one
+            self._buffer[episode_index] = Episode()
+        else:
+            self._num_transitions_in_complete_episodes -= episode_length
+            self._length -= 1
+            del self._buffer[episode_index]
+
+    def remove_episode(self, episode_index: int) -> None:
+        """
+        Remove the episode in the given index (even if it is not complete yet). This is the locking wrapper
+        around _remove_episode.
+        :param episode_index: the index of the episode to remove
+        :return: None
+        """
+        self.reader_writer_lock.lock_writing_and_reading()
+        try:
+            self._remove_episode(episode_index)
+        finally:
+            # release even if the removal raised
+            self.reader_writer_lock.release_writing_and_reading()
+
+    # for API compatibility
+    def get(self, episode_index: int, lock: bool=True) -> Union[None, Episode]:
+        """
+        Alias of get_episode: returns the episode in the given index, or None if the episode does not exist.
+        :param episode_index: the index of the episode to return
+        :param lock: passed on to get_episode as is
+        :return: the corresponding episode
+        """
+        episode = self.get_episode(episode_index, lock)
+        return episode
+
+    def get_last_complete_episode(self) -> Union[None, Episode]:
+        """
+        Returns the last complete episode in the memory or None if there are no complete episodes
+        :return: None or the last complete episode
+        """
+        self.reader_writer_lock.lock_writing()
+        try:
+            last_complete_episode_index = self.num_complete_episodes() - 1
+            if last_complete_episode_index < 0:
+                return None
+            # the lock is already held here, so get() is told not to take it a second time (the same way
+            # verify_last_episode_is_closed calls it)
+            return self.get(last_complete_episode_index, False)
+        finally:
+            self.reader_writer_lock.release_writing()
+
+    # for API compatibility
+    def remove(self, episode_index: int):
+        """
+        Alias of remove_episode: removes the episode in the given index (even if it is not complete yet)
+        :param episode_index: the index of the episode to remove
+        :return: None
+        """
+        index_to_remove = episode_index
+        self.remove_episode(index_to_remove)
+
+    def update_last_transition_info(self, info: Dict[str, Any]) -> None:
+        """
+        Update the info of the last transition stored in the memory. If the last episode is empty, the last
+        transition of the episode before it is updated. If there is no transition to update, nothing is done.
+        :param info: the new info to append to the existing info
+        :return: None
+        """
+        self.reader_writer_lock.lock_writing_and_reading()
+        try:
+            episode = self._buffer[-1]
+            if episode.length() == 0:
+                if len(self._buffer) < 2:
+                    return
+                episode = self._buffer[-2]
+            # the previous episode can be empty as well, in which case there is nothing to update
+            if episode.length() > 0:
+                episode.transitions[-1].info.update(info)
+        finally:
+            # the early return above used to leave without releasing, keeping the lock taken forever
+            self.reader_writer_lock.release_writing_and_reading()
+
+    def clean(self) -> None:
+        """
+        Empty the memory: drop all the episodes and transitions, and go back to holding a single empty episode
+        :return: None
+        """
+        self.reader_writer_lock.lock_writing_and_reading()
+
+        # zero the counters
+        self._num_transitions_in_complete_episodes = 0
+        self._num_transitions = 0
+
+        # same starting state as a newly created memory: one empty episode, no transitions
+        self._length = 1
+        self._buffer = [Episode()]
+        self.transitions = []
+
+        self.reader_writer_lock.release_writing_and_reading()
+
+    def mean_reward(self) -> np.ndarray:
+        """
+        Average the reward over all the transitions currently stored in the replay buffer
+        :return: the mean reward
+        """
+        self.reader_writer_lock.lock_writing()
+
+        rewards = [stored_transition.reward for stored_transition in self.transitions]
+        mean = np.mean(rewards)
+
+        self.reader_writer_lock.release_writing()
+        return mean
@@ -0,0 +1,147 @@
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import copy
+from enum import Enum
+from typing import Tuple, List
+
+import numpy as np
+
+from rl_coach.core_types import Episode, Transition
+from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters, EpisodicExperienceReplay
+from rl_coach.memories.non_episodic.experience_replay import MemoryGranularity
+from rl_coach.spaces import GoalsSpace
+
+
+class HindsightGoalSelectionMethod(Enum):
+    # selects where the state which is turned into a hindsight goal is taken from
+
+    # a state observed in the same episode, after the transition that is being replayed
+    Future = 0
+    # the final state in the episode
+    Final = 1
+    # a random state from the episode
+    Episode = 2
+    # a random state from the entire replay buffer
+    Random = 3
+
+
+class EpisodicHindsightExperienceReplayParameters(EpisodicExperienceReplayParameters):
+    """
+    Parameters for the EpisodicHindsightExperienceReplay memory. The hindsight specific values have no
+    default here - they all start out as None.
+    """
+    def __init__(self):
+        super().__init__()
+        self.hindsight_transitions_per_regular_transition = \
+            self.hindsight_goal_selection_method = \
+            self.goals_space = None
+
+    @property
+    def path(self):
+        """
+        :return: the path string naming the memory class that these parameters describe
+        """
+        memory_class_path = \
+            'rl_coach.memories.episodic.episodic_hindsight_experience_replay:EpisodicHindsightExperienceReplay'
+        return memory_class_path
+
+
+class EpisodicHindsightExperienceReplay(EpisodicExperienceReplay):
+    """
+    Implements Hindsight Experience Replay as described in the following paper: https://arxiv.org/pdf/1707.01495.pdf
+
+    """
+    def __init__(self, max_size: Tuple[MemoryGranularity, int],
+                 hindsight_transitions_per_regular_transition: int,
+                 hindsight_goal_selection_method: HindsightGoalSelectionMethod,
+                 goals_space: GoalsSpace):
+        """
+        :param max_size: The maximum size of the memory. should be defined in a granularity of Transitions
+        :param hindsight_transitions_per_regular_transition: How many artificial hindsight transitions are created
+                                                             out of every actual transition
+        :param hindsight_goal_selection_method: How the goals of the hindsight transitions are picked.
+                                                Should be one of HindsightGoalSelectionMethod
+        :param goals_space: A GoalsSpace which defines the base properties of the goals space
+        """
+        super().__init__(max_size)
+
+        self.last_episode_start_idx = 0
+
+        # hindsight configuration, kept exactly as it was given
+        self.goals_space = goals_space
+        self.hindsight_goal_selection_method = hindsight_goal_selection_method
+        self.hindsight_transitions_per_regular_transition = hindsight_transitions_per_regular_transition
+
+    def _sample_goal(self, episode_transitions: List, transition_index: int):
+        """
+        Pick one transition according to the goal selection method, and turn its state into a goal
+        :param episode_transitions: a list of all the transitions in the current episode
+        :param transition_index: the index of the transition being replayed. only the Future method uses it
+        :return: a goal corresponding to the state of the picked transition
+        """
+        method = self.hindsight_goal_selection_method
+
+        if method == HindsightGoalSelectionMethod.Final:
+            # the final state in the episode - nothing random about this one
+            selected_transition = episode_transitions[-1]
+        else:
+            if method == HindsightGoalSelectionMethod.Future:
+                # states that were observed in the same episode after the transition that is being replayed
+                candidates = episode_transitions[transition_index+1:]
+            elif method == HindsightGoalSelectionMethod.Episode:
+                # any state from the episode
+                candidates = episode_transitions
+            elif method == HindsightGoalSelectionMethod.Random:
+                # any state from the entire replay buffer
+                candidates = self.transitions
+            else:
+                raise ValueError("Invalid goal selection method was used for the hindsight goal selection")
+            selected_transition = np.random.choice(candidates)
+
+        return self.goals_space.goal_from_state(selected_transition.state)
+
+    def _sample_goals(self, episode_transitions: List, transition_index: int):
+        """
+        Sample hindsight_transitions_per_regular_transition goals, one call to _sample_goal for each
+        :param episode_transitions: a list of all the transitions in the current episode
+        :param transition_index: the transition to start sampling from
+        :return: a list with the sampled goals
+        """
+        goals = []
+        for _ in range(self.hindsight_transitions_per_regular_transition):
+            goals.append(self._sample_goal(episode_transitions, transition_index))
+        return goals
+
+    def store_episode(self, episode: Episode, lock: bool=True) -> None:
+        """
+        Add hindsight transitions to the given episode and store it in the memory. For each relevant transition
+        of the episode, hindsight_transitions_per_regular_transition copies are made, each with a sampled goal
+        written over its desired goal and with the reward and game over flag recalculated for that goal.
+        :param episode: the finished episode to store. the hindsight transitions are inserted into it in place
+        :param lock: passed on to the parent store_episode, which decides whether to take the lock
+        :return: None
+        :raises ValueError: if a sampled goal does not have the shape of the desired goal in the transition
+        """
+        # generate hindsight transitions only when an episode is finished.
+        # this is a snapshot of the actual transitions - the episode itself grows while the hindsight
+        # transitions are inserted into it below
+        last_episode_transitions = copy.copy(episode.transitions)
+
+        # cannot create a future hindsight goal in the last transition of an episode
+        if self.hindsight_goal_selection_method == HindsightGoalSelectionMethod.Future:
+            relevant_base_transitions = last_episode_transitions[:-1]
+        else:
+            relevant_base_transitions = last_episode_transitions
+
+        # for each transition in the last episode, create a set of hindsight transitions
+        for transition_index, transition in enumerate(relevant_base_transitions):
+            sampled_goals = self._sample_goals(last_episode_transitions, transition_index)
+            for goal in sampled_goals:
+                # NOTE(review): copy.copy is a shallow copy, and the goal is written into state / next_state
+                # below. this assumes that copying a Transition gives the copy its own state containers -
+                # otherwise the write would also change the original transition. confirm against Transition
+                hindsight_transition = copy.copy(transition)
+
+                if hindsight_transition.state['desired_goal'].shape != goal.shape:
+                    raise ValueError((
+                        'goal shape {goal_shape} already in transition is '
+                        'different than the one sampled as a hindsight goal '
+                        '{hindsight_goal_shape}.'
+                    ).format(
+                        goal_shape=hindsight_transition.state['desired_goal'].shape,
+                        hindsight_goal_shape=goal.shape,
+                    ))
+
+                # update the goal in the transition
+                hindsight_transition.state['desired_goal'] = goal
+                hindsight_transition.next_state['desired_goal'] = goal
+
+                # update the reward and terminal signal according to the goal
+                hindsight_transition.reward, hindsight_transition.game_over = \
+                    self.goals_space.get_reward_for_goal_and_state(goal, hindsight_transition.next_state)
+
+                # the return of the original transition does not apply to the new goal
+                hindsight_transition.total_return = None
+                episode.insert(hindsight_transition)
+
+        # honor the lock argument of the caller instead of silently falling back to the default
+        super().store_episode(episode, lock)
+
+    def store(self, transition: Transition):
+        """
+        Not supported by this memory - storing is done through store_episode only
+        :param transition: ignored
+        :raises ValueError: always
+        """
+        error_message = "An episodic HER cannot store a single transition. Only full episodes are to be stored."
+        raise ValueError(error_message)
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Tuple
+
+from rl_coach.core_types import Episode, Transition
+from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod, \
+    EpisodicHindsightExperienceReplay, EpisodicHindsightExperienceReplayParameters
+from rl_coach.memories.non_episodic.experience_replay import MemoryGranularity
+from rl_coach.spaces import GoalsSpace
+
+
+class EpisodicHRLHindsightExperienceReplayParameters(EpisodicHindsightExperienceReplayParameters):
+    """
+    Parameters for the EpisodicHRLHindsightExperienceReplay memory.
+    """
+    def __init__(self):
+        super().__init__()
+
+    @property
+    def path(self):
+        """
+        :return: the path string naming the memory class that these parameters describe
+        """
+        # the path starts at the rl_coach package, like the path of every other memory parameters class
+        return 'rl_coach.memories.episodic.episodic_hrl_hindsight_experience_replay:' \
+               'EpisodicHRLHindsightExperienceReplay'
+
+
+class EpisodicHRLHindsightExperienceReplay(EpisodicHindsightExperienceReplay):
+    """
+    Implements HRL Hindsight Experience Replay as described in the following paper:  https://arxiv.org/abs/1805.08180
+
+    This is the memory you should use if you want a shared hindsight experience replay buffer between multiple workers
+    """
+    def __init__(self, max_size: Tuple[MemoryGranularity, int],
+                 hindsight_transitions_per_regular_transition: int,
+                 hindsight_goal_selection_method: HindsightGoalSelectionMethod,
+                 goals_space: GoalsSpace,
+                 ):
+        """
+        :param max_size: The maximum size of the memory. should be defined in a granularity of Transitions
+        :param hindsight_transitions_per_regular_transition: The number of hindsight artificial transitions to generate
+                                                             for each actual transition
+        :param hindsight_goal_selection_method: The method that will be used for generating the goals for the
+                                                hindsight transitions. Should be one of HindsightGoalSelectionMethod
+        :param goals_space: A GoalsSpace  which defines the properties of the goals
+        """
+        super().__init__(max_size, hindsight_transitions_per_regular_transition, hindsight_goal_selection_method,
+                         goals_space)
+
+    def store_episode(self, episode: Episode, lock: bool=True) -> None:
+        """
+        Replace the action of every transition in the episode with the goal that was actually achieved, and
+        then store the episode through the parent class.
+        :param episode: the episode to store. the actions of its transitions are overwritten in place
+        :param lock: passed on to the parent store_episode
+        :return: None
+        """
+        # for a layer producing sub-goals, we will replace in hindsight the action (sub-goal) given to the lower
+        # level with the actual achieved goal. the achieved goal (and observation) seen is assumed to be the same
+        # for all levels - we can use this level's achieved goal instead of the lower level's one
+        for transition in episode.transitions:
+            new_achieved_goal = transition.next_state[self.goals_space.goal_name]
+            transition.action = new_achieved_goal
+
+        # honor the lock argument of the caller instead of silently falling back to the default
+        super().store_episode(episode, lock)
+
+    def store(self, transition: Transition):
+        """
+        Not supported by this memory - storing is done through store_episode only
+        :param transition: ignored
+        :raises ValueError: always
+        """
+        raise ValueError("An episodic HER cannot store a single transition. Only full episodes are to be stored.")
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from rl_coach.memories.memory import MemoryGranularity, MemoryParameters
+
+from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay
+
+
+class SingleEpisodeBufferParameters(MemoryParameters):
+    """
+    Parameters for the SingleEpisodeBuffer memory. The max_size attribute is deleted from the instance
+    once the base class has been initialized.
+    """
+    def __init__(self):
+        super().__init__()
+        delattr(self, 'max_size')
+
+    @property
+    def path(self):
+        """
+        :return: the path string naming the memory class that these parameters describe
+        """
+        memory_class_path = 'rl_coach.memories.episodic.single_episode_buffer:SingleEpisodeBuffer'
+        return memory_class_path
+
+
+class SingleEpisodeBuffer(EpisodicExperienceReplay):
+    """
+    An episodic experience replay whose maximum size is fixed to one episode.
+    """
+    def __init__(self):
+        single_episode_limit = (MemoryGranularity.Episodes, 1)
+        super().__init__(single_episode_limit)