bug fixes for OPE (#311)
@@ -697,7 +697,7 @@ class Agent(AgentInterface):
             # we either go sequentially through the entire replay buffer in the batch RL mode,
             # or sample randomly for the basic RL case.
-            training_schedule = self.call_memory('get_shuffled_data_generator', batch_size) if \
+            training_schedule = self.call_memory('get_shuffled_training_data_generator', batch_size) if \
                 self.ap.is_batch_rl_training else [self.call_memory('sample', batch_size) for _ in
                                                    range(self.ap.algorithm.num_consecutive_training_steps)]
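
For readers outside the Coach code base, here is a standalone sketch of the two scheduling modes this hunk switches between: one shuffled pass over the whole replay buffer per call in batch RL mode, versus independently sampled random batches in the online case. The function names and the toy buffer are illustrative only, not Coach APIs.

# Minimal standalone sketch (not Coach's Agent class) of the two scheduling modes.
import random
from typing import Iterator, List


def shuffled_training_data_generator(buffer: List[dict], batch_size: int) -> Iterator[List[dict]]:
    """Yield the whole buffer once, as shuffled mini-batches (batch RL mode)."""
    indices = list(range(len(buffer)))
    random.shuffle(indices)
    for start in range(0, len(indices), batch_size):
        yield [buffer[i] for i in indices[start:start + batch_size]]


def random_sample_schedule(buffer: List[dict], batch_size: int, num_steps: int) -> List[List[dict]]:
    """Sample num_steps independent random batches (online RL mode)."""
    return [random.sample(buffer, min(batch_size, len(buffer))) for _ in range(num_steps)]


buffer = [{'step': i} for i in range(10)]
is_batch_rl_training = True
training_schedule = shuffled_training_data_generator(buffer, batch_size=4) if is_batch_rl_training \
    else random_sample_schedule(buffer, batch_size=4, num_steps=3)
for batch in training_schedule:
    print(len(batch))  # 4, 4, 2 in batch RL mode
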
@@ -155,7 +155,7 @@ class DDQNBCQAgent(DQNAgent):
         reward_model_loss = 0
         imitation_model_loss = 0
         total_transitions_processed = 0
-        for i, batch in enumerate(self.call_memory('get_shuffled_data_generator', batch_size)):
+        for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
             batch = Batch(batch)

             # reward model
@@ -164,7 +164,7 @@ class ValueOptimizationAgent(Agent):
         for epoch in range(epochs):
             loss = 0
             total_transitions_processed = 0
-            for i, batch in enumerate(self.call_memory('get_shuffled_data_generator', batch_size)):
+            for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
                 batch = Batch(batch)
                 loss += self.get_reward_model_loss(batch)
                 total_transitions_processed += batch.size
@@ -173,12 +173,12 @@ class BatchRLGraphManager(BasicRLGraphManager):
         """
         agent = self.level_managers[0].agents['agent']

-        screen.log_title("Training a regression model for estimating MDP rewards")
-        agent.improve_reward_model(epochs=self.reward_model_num_epochs)
+        # prepare dataset to be consumed in the expected formats for OPE
+        agent.memory.prepare_evaluation_dataset()
+
+        screen.log_title("Training a regression model for estimating MDP rewards")
+        agent.improve_reward_model(epochs=self.reward_model_num_epochs)

         screen.log_title("Collecting static statistics for OPE")
         agent.ope_manager.gather_static_shared_stats(evaluation_dataset_as_transitions=
                                                      agent.memory.evaluation_dataset_as_transitions,
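
Read as a pipeline, the new ordering is: carve out the held-out OPE dataset first, then fit the reward regression model, then compute the statistics shared by the OPE estimators. Below is a hypothetical, dependency-free sketch of that sequence; the callables are stand-ins and do not mirror Coach's actual method signatures.

# Hypothetical sketch of the improve() ordering after this change: the evaluation dataset
# is prepared before the reward model is trained, so the model and the OPE statistics are
# computed against a consistent train/evaluation split. Stand-in callables, not Coach APIs.
from typing import Callable, List


def batch_rl_improve_sequence(prepare_evaluation_dataset: Callable[[], List],
                              improve_reward_model: Callable[[int], None],
                              gather_static_ope_stats: Callable[[List], None],
                              reward_model_num_epochs: int) -> None:
    evaluation_transitions = prepare_evaluation_dataset()   # 1. split off the held-out OPE data
    improve_reward_model(reward_model_num_epochs)           # 2. fit the reward regression model
    gather_static_ope_stats(evaluation_transitions)         # 3. statistics shared by the OPE estimators


# toy usage: the last two items of a fake dataset play the role of the evaluation split
dataset = list(range(10))
batch_rl_improve_sequence(lambda: dataset[8:],
                          lambda epochs: None,
                          lambda transitions: print(f'{len(transitions)} evaluation transitions'),
                          reward_model_num_epochs=100)
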
@@ -148,7 +148,7 @@ class EpisodicExperienceReplay(Memory):
         random.shuffle(self._buffer)
         self.transitions = [t for e in self._buffer for t in e.transitions]

-    def get_shuffled_data_generator(self, size: int) -> List[Transition]:
+    def get_shuffled_training_data_generator(self, size: int) -> List[Transition]:
         """
         Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.
         If the requested size is larger than the number of samples available in the replay buffer then the batch will
@@ -159,15 +159,6 @@ class EpisodicExperienceReplay(Memory):
         :return: a batch (list) of selected transitions from the replay buffer
         """
         self.reader_writer_lock.lock_writing()
-        if self.last_training_set_transition_id is None:
-            if self.train_to_eval_ratio < 0 or self.train_to_eval_ratio >= 1:
-                raise ValueError('train_to_eval_ratio should be in the (0, 1] range.')
-
-            transition = self.transitions[round(self.train_to_eval_ratio * self.num_transitions_in_complete_episodes())]
-            episode_num, episode = self.get_episode_for_transition(transition)
-            self.last_training_set_episode_id = episode_num
-            self.last_training_set_transition_id = \
-                len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])

         shuffled_transition_indices = list(range(self.last_training_set_transition_id))
         random.shuffle(shuffled_transition_indices)
@@ -483,6 +474,9 @@ class EpisodicExperienceReplay(Memory):
         Gather the memory content that will be used for off-policy evaluation in episodes and transitions format
         :return:
         """
         self.reader_writer_lock.lock_writing_and_reading()

+        self._split_training_and_evaluation_datasets()
         self.evaluation_dataset_as_episodes = deepcopy(
             self.get_all_complete_episodes_from_to(self.get_last_training_set_episode_id() + 1,
                                                    self.num_complete_episodes()))
@@ -493,3 +487,20 @@ class EpisodicExperienceReplay(Memory):

         self.evaluation_dataset_as_transitions = [t for e in self.evaluation_dataset_as_episodes
                                                   for t in e.transitions]
         self.reader_writer_lock.release_writing_and_reading()

+    def _split_training_and_evaluation_datasets(self):
+        """
+        If the data in the buffer was not split to training and evaluation yet, split it accordingly.
+        :return: None
+        """
+
+        if self.last_training_set_transition_id is None:
+            if self.train_to_eval_ratio < 0 or self.train_to_eval_ratio >= 1:
+                raise ValueError('train_to_eval_ratio should be in the (0, 1] range.')
+
+            transition = self.transitions[round(self.train_to_eval_ratio * self.num_transitions_in_complete_episodes())]
+            episode_num, episode = self.get_episode_for_transition(transition)
+            self.last_training_set_episode_id = episode_num
+            self.last_training_set_transition_id = \
+                len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])
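
The extracted helper snaps the train/evaluation boundary to the end of the episode that contains the transition at `train_to_eval_ratio` of the buffer, so no episode straddles the split. Here is a standalone sketch of that idea on toy episode lists (not Coach's `EpisodicExperienceReplay`), with a slightly stricter (0, 1) ratio check and episodes assumed to be plain lists of transitions.

# Sketch of the episode-boundary split performed by _split_training_and_evaluation_datasets().
from typing import List, Tuple


def split_on_episode_boundary(episodes: List[List[int]],
                              train_to_eval_ratio: float) -> Tuple[List[int], List[int]]:
    # stricter check than the original condition, for this sketch only
    if not 0 < train_to_eval_ratio < 1:
        raise ValueError('train_to_eval_ratio should be strictly between 0 and 1.')

    num_transitions = sum(len(e) for e in episodes)
    cutoff_transition = round(train_to_eval_ratio * num_transitions)

    # find the episode that contains the cutoff transition
    seen = 0
    last_training_episode_id = len(episodes) - 1
    for episode_num, episode in enumerate(episodes):
        seen += len(episode)
        if cutoff_transition < seen:
            last_training_episode_id = episode_num
            break

    training = [t for e in episodes[:last_training_episode_id + 1] for t in e]
    evaluation = [t for e in episodes[last_training_episode_id + 1:] for t in e]
    return training, evaluation


episodes = [[0, 1, 2], [3, 4], [5, 6, 7, 8], [9]]
train, evaluation = split_on_episode_boundary(episodes, 0.5)
print(len(train), len(evaluation))  # 9 1: the split lands on an episode boundary
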
@@ -92,7 +92,7 @@ class ExperienceReplay(Memory):
         self.reader_writer_lock.release_writing()
         return batch

-    def get_shuffled_data_generator(self, size: int) -> List[Transition]:
+    def get_shuffled_training_data_generator(self, size: int) -> List[Transition]:
         """
         Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.
         If the requested size is larger than the number of samples available in the replay buffer then the batch will
@@ -26,7 +26,9 @@ class SequentialDoublyRobust(object):
         """
         Run the off-policy evaluator to get a score for the goodness of the new policy, based on the dataset,
         which was collected using other policy(ies).

+        When the episodes are of changing lengths, this estimator might prove problematic due to its nature of recursion
+        of adding rewards up to the end of the episode (horizon). It will probably work best with episodes of fixed
+        length.
         Paper: https://arxiv.org/pdf/1511.03722.pdf

         :return: the evaluation score
@@ -37,7 +39,7 @@ class SequentialDoublyRobust(object):

         for episode in evaluation_dataset_as_episodes:
             episode_seq_dr = 0
-            for transition in episode.transitions:
+            for transition in reversed(episode.transitions):
                 rho = transition.info['softmax_policy_prob'][transition.action] / \
                     transition.info['all_action_probabilities'][transition.action]
                 episode_seq_dr = transition.info['v_value_q_model_based'] + rho * (transition.reward + discount_factor
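
Iterating the transitions in reverse is what lets the estimate for step t reuse the estimate already accumulated for the tail of the episode. The update line in the hunk above is truncated, so the sketch below follows the sequential doubly robust recursion from the cited paper (Jiang & Li), with toy stand-ins for Coach's `transition.info` entries; the field names are assumptions, not Coach's.

# Self-contained sketch of the backward recursion, in the spirit of sequential doubly robust
# evaluation: DR_t = V_hat(s_t) + rho_t * (r_t + gamma * DR_{t+1} - Q_hat(s_t, a_t)).
from dataclasses import dataclass
from typing import List


@dataclass
class Transition:
    reward: float
    rho: float         # new-policy prob / behavior-policy prob for the taken action
    v_estimate: float  # model-based estimate of V(s_t) under the new policy
    q_estimate: float  # model-based estimate of Q(s_t, a_t)


def sequential_doubly_robust(episode: List[Transition], discount_factor: float = 0.99) -> float:
    episode_dr = 0.0
    # walk the episode from its last transition to its first, so each step can fold in the
    # estimate already accumulated for the remainder (horizon) of the episode
    for t in reversed(episode):
        episode_dr = t.v_estimate + t.rho * (t.reward + discount_factor * episode_dr - t.q_estimate)
    return episode_dr


episode = [Transition(reward=1.0, rho=1.2, v_estimate=2.0, q_estimate=1.9),
           Transition(reward=0.0, rho=0.8, v_estimate=1.0, q_estimate=1.1)]
print(sequential_doubly_robust(episode))
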
@@ -46,8 +46,12 @@ class WeightedImportanceSampling(object):
             per_episode_w_i.append(w_i)

         total_w_i_sum_across_episodes = sum(per_episode_w_i)

         wis = 0
-        for i, episode in enumerate(evaluation_dataset_as_episodes):
-            wis += per_episode_w_i[i]/total_w_i_sum_across_episodes * episode.transitions[0].n_step_discounted_rewards
+        if total_w_i_sum_across_episodes != 0:
+            for i, episode in enumerate(evaluation_dataset_as_episodes):
+                if len(episode.transitions) != 0:
+                    wis += per_episode_w_i[i] * episode.transitions[0].n_step_discounted_rewards
+            wis /= total_w_i_sum_across_episodes

         return wis
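
The guard added here avoids a division by zero when every per-episode importance weight is zero, and skips empty episodes when accumulating returns. Below is a compact, runnable sketch of the same computation on toy inputs, assuming the per-episode weights and discounted returns are already computed.

# Weighted importance sampling with the same zero-weight guard as the fix above.
from typing import List


def weighted_importance_sampling(per_episode_w_i: List[float],
                                 per_episode_return: List[float]) -> float:
    total_w_i_sum_across_episodes = sum(per_episode_w_i)
    wis = 0.0
    if total_w_i_sum_across_episodes != 0:
        for w_i, episode_return in zip(per_episode_w_i, per_episode_return):
            wis += w_i * episode_return
        wis /= total_w_i_sum_across_episodes
    return wis


print(weighted_importance_sampling([0.5, 2.0, 0.0], [10.0, 4.0, 7.0]))  # 5.2
print(weighted_importance_sampling([0.0, 0.0], [10.0, 4.0]))            # 0.0 (guarded)
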