Batch RL (#238)

2026-02-25 03:35:51 +01:00 · 2019-03-19 18:07:09 +02:00
parent 4a8451ff02
commit e3c7e526c7
38 changed files with 1003 additions and 87 deletions
--- a/rl_coach/memories/episodic/episodic_experience_replay.py
+++ b/rl_coach/memories/episodic/episodic_experience_replay.py
@@ -1,4 +1,5 @@
 #
+#
 # Copyright (c) 2017 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,14 +14,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import ast

-from typing import List, Tuple, Union, Dict, Any
-
+import pandas as pd
+from typing import List, Tuple, Union
 import numpy as np
+import random

 from rl_coach.core_types import Transition, Episode
+from rl_coach.logger import screen
 from rl_coach.memories.memory import Memory, MemoryGranularity, MemoryParameters
-from rl_coach.utils import ReaderWriterLock
+from rl_coach.utils import ReaderWriterLock, ProgressBar
+from rl_coach.core_types import CsvDataset


 class EpisodicExperienceReplayParameters(MemoryParameters):
@@ -28,6 +33,7 @@ class EpisodicExperienceReplayParameters(MemoryParameters):
        super().__init__()
        self.max_size = (MemoryGranularity.Transitions, 1000000)
        self.n_step = -1
+        self.train_to_eval_ratio = 1  # for OPE we'll want a value < 1

    @property
    def path(self):
@@ -40,7 +46,9 @@ class EpisodicExperienceReplay(Memory):
    calculations of total return and other values that depend on the sequential behavior of the transitions
    in the episode.
    """
-    def __init__(self, max_size: Tuple[MemoryGranularity, int]=(MemoryGranularity.Transitions, 1000000), n_step=-1):
+
+    def __init__(self, max_size: Tuple[MemoryGranularity, int] = (MemoryGranularity.Transitions, 1000000), n_step=-1,
+                 train_to_eval_ratio: int = 1):
        """
        :param max_size: the maximum number of transitions or episodes to hold in the memory
        """
@@ -52,8 +60,11 @@ class EpisodicExperienceReplay(Memory):
        self._num_transitions = 0
        self._num_transitions_in_complete_episodes = 0
        self.reader_writer_lock = ReaderWriterLock()
+        self.last_training_set_episode_id = None  # used in batch-rl
+        self.last_training_set_transition_id = None  # used in batch-rl
+        self.train_to_eval_ratio = train_to_eval_ratio  # used in batch-rl

-    def length(self, lock: bool=False) -> int:
+    def length(self, lock: bool = False) -> int:
        """
        Get the number of episodes in the ER (even if they are not complete)
        """
@@ -75,6 +86,9 @@ class EpisodicExperienceReplay(Memory):
    def num_transitions_in_complete_episodes(self):
        return self._num_transitions_in_complete_episodes

+    def get_last_training_set_episode_id(self):
+        return self.last_training_set_episode_id
+
    def sample(self, size: int, is_consecutive_transitions=False) -> List[Transition]:
        """
        Sample a batch of transitions from the replay buffer. If the requested size is larger than the number
@@ -92,7 +106,7 @@ class EpisodicExperienceReplay(Memory):
                    batch = self._buffer[episode_idx].transitions
                else:
                    transition_idx = np.random.randint(size, self._buffer[episode_idx].length())
-                    batch = self._buffer[episode_idx].transitions[transition_idx-size:transition_idx]
+                    batch = self._buffer[episode_idx].transitions[transition_idx - size:transition_idx]
            else:
                transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=size)
                batch = [self.transitions[i] for i in transitions_idx]
@@ -105,6 +119,79 @@ class EpisodicExperienceReplay(Memory):

        return batch

+    def get_episode_for_transition(self, transition: Transition) -> (int, Episode):
+        """
+        Get the episode from which that transition came from.
+        :param transition: The transition to lookup the episode for
+        :return: (Episode number, the episode) or (-1, None) if could not find a matching episode.
+        """
+
+        for i, episode in enumerate(self._buffer):
+            if transition in episode.transitions:
+                return i, episode
+        return -1, None
+
+    def shuffle_episodes(self):
+        """
+        Shuffle all the episodes in the replay buffer
+        :return:
+        """
+        random.shuffle(self._buffer)
+        self.transitions = [t for e in self._buffer for t in e.transitions]
+
+    def get_shuffled_data_generator(self, size: int) -> List[Transition]:
+        """
+        Get an generator for iterating through the shuffled replay buffer, for processing the data in epochs.
+        If the requested size is larger than the number of samples available in the replay buffer then the batch will
+        return empty. The last returned batch may be smaller than the size requested, to accommodate for all the
+        transitions in the replay buffer.
+
+        :param size: the size of the batch to return
+        :return: a batch (list) of selected transitions from the replay buffer
+        """
+        self.reader_writer_lock.lock_writing()
+        if self.last_training_set_transition_id is None:
+            if self.train_to_eval_ratio < 0 or self.train_to_eval_ratio >= 1:
+                raise ValueError('train_to_eval_ratio should be in the (0, 1] range.')
+
+            transition = self.transitions[round(self.train_to_eval_ratio * self.num_transitions_in_complete_episodes())]
+            episode_num, episode = self.get_episode_for_transition(transition)
+            self.last_training_set_episode_id = episode_num
+            self.last_training_set_transition_id = \
+                len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])
+
+        shuffled_transition_indices = list(range(self.last_training_set_transition_id))
+        random.shuffle(shuffled_transition_indices)
+
+        # we deliberately drop some of the ending data which is left after dividing to batches of size `size`
+        # for i in range(math.ceil(len(shuffled_transition_indices) / size)):
+        for i in range(int(len(shuffled_transition_indices) / size)):
+            sample_data = [self.transitions[j] for j in shuffled_transition_indices[i * size: (i + 1) * size]]
+            self.reader_writer_lock.release_writing()
+
+            yield sample_data
+
+    def get_all_complete_episodes_transitions(self) -> List[Transition]:
+        """
+        Get all the transitions from all the complete episodes in the buffer
+        :return: a list of transitions
+        """
+        return self.transitions[:self.num_transitions_in_complete_episodes()]
+
+    def get_all_complete_episodes(self) -> List[Episode]:
+        """
+        Get all the transitions from all the complete episodes in the buffer
+        :return: a list of transitions
+        """
+        return self.get_all_complete_episodes_from_to(0, self.num_complete_episodes())
+
+    def get_all_complete_episodes_from_to(self, start_episode_id, end_episode_id) -> List[Episode]:
+        """
+        Get all the transitions from all the complete episodes in the buffer matching the given episode range
+        :return: a list of transitions
+        """
+        return self._buffer[start_episode_id:end_episode_id]
+
    def _enforce_max_length(self) -> None:
        """
        Make sure that the size of the replay buffer does not pass the maximum size allowed.
@@ -188,7 +275,7 @@ class EpisodicExperienceReplay(Memory):

        self.reader_writer_lock.release_writing_and_reading()

-    def store_episode(self, episode: Episode, lock: bool=True) -> None:
+    def store_episode(self, episode: Episode, lock: bool = True) -> None:
        """
        Store a new episode in the memory.
        :param episode: the new episode to store
@@ -211,7 +298,7 @@ class EpisodicExperienceReplay(Memory):
        if lock:
            self.reader_writer_lock.release_writing_and_reading()

-    def get_episode(self, episode_index: int, lock: bool=True) -> Union[None, Episode]:
+    def get_episode(self, episode_index: int, lock: bool = True) -> Union[None, Episode]:
        """
        Returns the episode in the given index. If the episode does not exist, returns None instead.
        :param episode_index: the index of the episode to return
@@ -256,7 +343,7 @@ class EpisodicExperienceReplay(Memory):
        self.reader_writer_lock.release_writing_and_reading()

    # for API compatibility
-    def get(self, episode_index: int, lock: bool=True) -> Union[None, Episode]:
+    def get(self, episode_index: int, lock: bool = True) -> Union[None, Episode]:
        """
        Returns the episode in the given index. If the episode does not exist, returns None instead.
        :param episode_index: the index of the episode to return
@@ -315,3 +402,47 @@ class EpisodicExperienceReplay(Memory):

        self.reader_writer_lock.release_writing()
        return mean
+
+    def load_csv(self, csv_dataset: CsvDataset) -> None:
+        """
+        Restore the replay buffer contents from a csv file.
+        The csv file is assumed to include a list of transitions.
+        :param csv_dataset: A construct which holds the dataset parameters
+        """
+        df = pd.read_csv(csv_dataset.filepath)
+        if len(df) > self.max_size[1]:
+            screen.warning("Warning! The number of transitions to load into the replay buffer ({}) is "
+                           "bigger than the max size of the replay buffer ({}). The excessive transitions will "
+                           "not be stored.".format(len(df), self.max_size[1]))
+
+        episode_ids = df['episode_id'].unique()
+        progress_bar = ProgressBar(len(episode_ids))
+        state_columns = [col for col in df.columns if col.startswith('state_feature')]
+
+        for e_id in episode_ids:
+            progress_bar.update(e_id)
+            df_episode_transitions = df[df['episode_id'] == e_id]
+            episode = Episode()
+            for (_, current_transition), (_, next_transition) in zip(df_episode_transitions[:-1].iterrows(),
+                                                                     df_episode_transitions[1:].iterrows()):
+                state = np.array([current_transition[col] for col in state_columns])
+                next_state = np.array([next_transition[col] for col in state_columns])
+
+                episode.insert(
+                    Transition(state={'observation': state},
+                               action=current_transition['action'], reward=current_transition['reward'],
+                               next_state={'observation': next_state}, game_over=False,
+                               info={'all_action_probabilities':
+                                         ast.literal_eval(current_transition['all_action_probabilities'])}))
+
+            # Set the last transition to end the episode
+            if csv_dataset.is_episodic:
+                episode.get_last_transition().game_over = True
+
+            self.store_episode(episode)
+
+        # close the progress bar
+        progress_bar.update(len(episode_ids))
+        progress_bar.close()
+
+        self.shuffle_episodes()
--- a/rl_coach/memories/non_episodic/experience_replay.py
+++ b/rl_coach/memories/non_episodic/experience_replay.py
@@ -14,10 +14,10 @@
 # limitations under the License.
 #

-from typing import List, Tuple, Union, Dict, Any
+from typing import List, Tuple, Union
 import pickle
-import sys
-import time
+import random
+import math

 import numpy as np

@@ -72,7 +72,6 @@ class ExperienceReplay(Memory):
        Sample a batch of transitions form the replay buffer. If the requested size is larger than the number
        of samples available in the replay buffer then the batch will return empty.
        :param size: the size of the batch to sample
-        :param beta: the beta parameter used for importance sampling
        :return: a batch (list) of selected transitions from the replay buffer
        """
        self.reader_writer_lock.lock_writing()
@@ -92,6 +91,32 @@ class ExperienceReplay(Memory):
        self.reader_writer_lock.release_writing()
        return batch

+    def get_shuffled_data_generator(self, size: int) -> List[Transition]:
+        """
+        Get an generator for iterating through the shuffled replay buffer, for processing the data in epochs.
+        If the requested size is larger than the number of samples available in the replay buffer then the batch will
+        return empty. The last returned batch may be smaller than the size requested, to accommodate for all the
+        transitions in the replay buffer.
+
+        :param size: the size of the batch to return
+        :return: a batch (list) of selected transitions from the replay buffer
+        """
+        self.reader_writer_lock.lock_writing()
+        shuffled_transition_indices = list(range(len(self.transitions)))
+        random.shuffle(shuffled_transition_indices)
+
+        # we deliberately drop some of the ending data which is left after dividing to batches of size `size`
+        # for i in range(math.ceil(len(shuffled_transition_indices) / size)):
+        for i in range(int(len(shuffled_transition_indices) / size)):
+            sample_data = [self.transitions[j] for j in shuffled_transition_indices[i * size: (i + 1) * size]]
+            self.reader_writer_lock.release_writing()
+
+            yield sample_data
+
+        ## usage example
+        # for o in random_seq_generator(list(range(10)), 4):
+        #     print(o)
+
    def _enforce_max_length(self) -> None:
        """
        Make sure that the size of the replay buffer does not pass the maximum size allowed.
@@ -215,7 +240,7 @@ class ExperienceReplay(Memory):
        with open(file_path, 'wb') as file:
            pickle.dump(self.transitions, file)

-    def load(self, file_path: str) -> None:
+    def load_pickled(self, file_path: str) -> None:
        """
        Restore the replay buffer contents from a pickle file.
        The pickle file is assumed to include a list of transitions.
@@ -238,3 +263,4 @@ class ExperienceReplay(Memory):
                    progress_bar.update(transition_idx)

            progress_bar.close()
+