
bug fixes for OPE (#311)

Gal Leibovich authored 2019-05-21 16:39:11 +03:00, committed by GitHub
parent 85d70dd7d5, commit acceb03ac0
8 changed files with 38 additions and 21 deletions


@@ -697,7 +697,7 @@ class Agent(AgentInterface):
             # we either go sequentially through the entire replay buffer in the batch RL mode,
             # or sample randomly for the basic RL case.
-            training_schedule = self.call_memory('get_shuffled_data_generator', batch_size) if \
+            training_schedule = self.call_memory('get_shuffled_training_data_generator', batch_size) if \
                 self.ap.is_batch_rl_training else [self.call_memory('sample', batch_size) for _ in
                                                    range(self.ap.algorithm.num_consecutive_training_steps)]
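
In batch RL mode this generator walks the shuffled training split of the replay buffer once per epoch, while the basic RL path keeps sampling random batches; the rename makes it explicit that only the training portion is iterated, leaving the held-out portion for off-policy evaluation. A minimal sketch of how such a generator might be consumed, with memory and train_on_batch as hypothetical stand-ins rather than Coach's API:

    # Sketch only: `memory` and `train_on_batch` are illustrative stand-ins, not Coach's API.
    def run_training_epoch(memory, batch_size, train_on_batch):
        total_loss = 0.0
        # each epoch iterates the shuffled *training* split exactly once
        for batch in memory.get_shuffled_training_data_generator(batch_size):
            total_loss += train_on_batch(batch)
        return total_loss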


@@ -155,7 +155,7 @@ class DDQNBCQAgent(DQNAgent):
             reward_model_loss = 0
             imitation_model_loss = 0
             total_transitions_processed = 0
-            for i, batch in enumerate(self.call_memory('get_shuffled_data_generator', batch_size)):
+            for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
                 batch = Batch(batch)
                 # reward model


@@ -164,7 +164,7 @@ class ValueOptimizationAgent(Agent):
         for epoch in range(epochs):
             loss = 0
             total_transitions_processed = 0
-            for i, batch in enumerate(self.call_memory('get_shuffled_data_generator', batch_size)):
+            for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
                 batch = Batch(batch)
                 loss += self.get_reward_model_loss(batch)
                 total_transitions_processed += batch.size


@@ -173,12 +173,12 @@ class BatchRLGraphManager(BasicRLGraphManager):
"""
agent = self.level_managers[0].agents['agent']
screen.log_title("Training a regression model for estimating MDP rewards")
agent.improve_reward_model(epochs=self.reward_model_num_epochs)
# prepare dataset to be consumed in the expected formats for OPE
agent.memory.prepare_evaluation_dataset()
screen.log_title("Training a regression model for estimating MDP rewards")
agent.improve_reward_model(epochs=self.reward_model_num_epochs)
screen.log_title("Collecting static statistics for OPE")
agent.ope_manager.gather_static_shared_stats(evaluation_dataset_as_transitions=
agent.memory.evaluation_dataset_as_transitions,
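
Reordering these calls puts the train/evaluation split (done inside prepare_evaluation_dataset, see the episodic replay buffer change below) in place before the reward model is fit and the OPE statistics are gathered. A schematic of the corrected ordering, not the full graph-manager method; the gather_static_shared_stats call is truncated in the hunk, so its further arguments are omitted here:

    # Schematic of the corrected call order; further arguments to
    # gather_static_shared_stats are omitted because the hunk truncates them.
    def set_up_ope(agent, reward_model_num_epochs):
        # 1. split the buffer and freeze the held-out evaluation set first
        agent.memory.prepare_evaluation_dataset()
        # 2. fit the reward regression model on the training split only
        agent.improve_reward_model(epochs=reward_model_num_epochs)
        # 3. gather static statistics over the evaluation set for the OPE estimators
        agent.ope_manager.gather_static_shared_stats(
            evaluation_dataset_as_transitions=agent.memory.evaluation_dataset_as_transitions)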


@@ -148,7 +148,7 @@ class EpisodicExperienceReplay(Memory):
         random.shuffle(self._buffer)
         self.transitions = [t for e in self._buffer for t in e.transitions]
-    def get_shuffled_data_generator(self, size: int) -> List[Transition]:
+    def get_shuffled_training_data_generator(self, size: int) -> List[Transition]:
         """
         Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.
         If the requested size is larger than the number of samples available in the replay buffer then the batch will
@@ -159,15 +159,6 @@ class EpisodicExperienceReplay(Memory):
         :return: a batch (list) of selected transitions from the replay buffer
         """
         self.reader_writer_lock.lock_writing()
-        if self.last_training_set_transition_id is None:
-            if self.train_to_eval_ratio < 0 or self.train_to_eval_ratio >= 1:
-                raise ValueError('train_to_eval_ratio should be in the (0, 1] range.')
-            transition = self.transitions[round(self.train_to_eval_ratio * self.num_transitions_in_complete_episodes())]
-            episode_num, episode = self.get_episode_for_transition(transition)
-            self.last_training_set_episode_id = episode_num
-            self.last_training_set_transition_id = \
-                len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])
         shuffled_transition_indices = list(range(self.last_training_set_transition_id))
         random.shuffle(shuffled_transition_indices)
@@ -483,6 +474,9 @@ class EpisodicExperienceReplay(Memory):
         Gather the memory content that will be used for off-policy evaluation in episodes and transitions format
         :return:
         """
         self.reader_writer_lock.lock_writing_and_reading()
+        self._split_training_and_evaluation_datasets()
         self.evaluation_dataset_as_episodes = deepcopy(
             self.get_all_complete_episodes_from_to(self.get_last_training_set_episode_id() + 1,
                                                    self.num_complete_episodes()))
@@ -493,3 +487,20 @@ class EpisodicExperienceReplay(Memory):
         self.evaluation_dataset_as_transitions = [t for e in self.evaluation_dataset_as_episodes
                                                   for t in e.transitions]
         self.reader_writer_lock.release_writing_and_reading()
+    def _split_training_and_evaluation_datasets(self):
+        """
+        If the data in the buffer has not been split into training and evaluation sets yet, split it accordingly.
+        :return: None
+        """
+        if self.last_training_set_transition_id is None:
+            if self.train_to_eval_ratio < 0 or self.train_to_eval_ratio >= 1:
+                raise ValueError('train_to_eval_ratio should be in the (0, 1] range.')
+            transition = self.transitions[round(self.train_to_eval_ratio * self.num_transitions_in_complete_episodes())]
+            episode_num, episode = self.get_episode_for_transition(transition)
+            self.last_training_set_episode_id = episode_num
+            self.last_training_set_transition_id = \
+                len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])
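
The split is made at an episode boundary: the cut-point transition implied by train_to_eval_ratio is mapped to the episode that contains it, and everything up to the end of that episode becomes the training set. A toy illustration of the same arithmetic, using plain episode lengths instead of Coach's Episode objects:

    # Toy illustration of the episode-boundary split (not the Coach implementation).
    def split_point(episode_lengths, train_to_eval_ratio):
        num_transitions = sum(episode_lengths)              # e.g. [4, 3, 5] -> 12
        cut = round(train_to_eval_ratio * num_transitions)  # ratio 0.5 -> transition index 6
        seen = 0
        for episode_id, length in enumerate(episode_lengths):
            seen += length
            if cut < seen:               # the cut falls inside this episode
                return episode_id, seen  # last training episode id, number of training transitions
        return len(episode_lengths) - 1, seen

With episode lengths [4, 3, 5] and a 0.5 ratio the cut lands inside the second episode, so the first two episodes (7 transitions) form the training set and the third episode is held out for evaluation.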


@@ -92,7 +92,7 @@ class ExperienceReplay(Memory):
         self.reader_writer_lock.release_writing()
         return batch
-    def get_shuffled_data_generator(self, size: int) -> List[Transition]:
+    def get_shuffled_training_data_generator(self, size: int) -> List[Transition]:
         """
         Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.
         If the requested size is larger than the number of samples available in the replay buffer then the batch will


@@ -26,7 +26,9 @@ class SequentialDoublyRobust(object):
"""
Run the off-policy evaluator to get a score for the goodness of the new policy, based on the dataset,
which was collected using other policy(ies).
When the epsiodes are of changing lengths, this estimator might prove problematic due to its nature of recursion
of adding rewards up to the end of the episode (horizon). It will probably work best with episodes of fixed
length.
Paper: https://arxiv.org/pdf/1511.03722.pdf
:return: the evaluation score
@@ -37,7 +39,7 @@ class SequentialDoublyRobust(object):
         for episode in evaluation_dataset_as_episodes:
             episode_seq_dr = 0
-            for transition in episode.transitions:
+            for transition in reversed(episode.transitions):
                 rho = transition.info['softmax_policy_prob'][transition.action] / \
                       transition.info['all_action_probabilities'][transition.action]
                 episode_seq_dr = transition.info['v_value_q_model_based'] + rho * (transition.reward + discount_factor
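
The sequential doubly robust estimate is defined by a recursion that starts at the end of the episode, which is why the loop now runs over reversed(episode.transitions): each step corrects the model-based state value with the importance-weighted error of the reward plus the discounted estimate of everything that follows. A minimal sketch of that recursion, with rho, rewards, v_hat and q_hat as illustrative per-timestep lists rather than Coach's Transition fields:

    # Sketch of the sequential DR recursion (https://arxiv.org/pdf/1511.03722.pdf);
    # names are illustrative, not Coach's API.
    def sequential_doubly_robust(rho, rewards, v_hat, q_hat, discount_factor):
        dr = 0.0
        # walk the episode backwards so each step bootstraps off the estimate
        # of the remaining return, not of the earlier timesteps
        for t in reversed(range(len(rewards))):
            dr = v_hat[t] + rho[t] * (rewards[t] + discount_factor * dr - q_hat[t])
        return dr

Iterating forward, as the old code did, bootstraps each step off the earlier part of the episode instead of the remaining return, so the per-episode score only comes out right for single-transition episodes.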


@@ -46,8 +46,12 @@ class WeightedImportanceSampling(object):
             per_episode_w_i.append(w_i)
         total_w_i_sum_across_episodes = sum(per_episode_w_i)
         wis = 0
-        for i, episode in enumerate(evaluation_dataset_as_episodes):
-            wis += per_episode_w_i[i]/total_w_i_sum_across_episodes * episode.transitions[0].n_step_discounted_rewards
+        if total_w_i_sum_across_episodes != 0:
+            for i, episode in enumerate(evaluation_dataset_as_episodes):
+                if len(episode.transitions) != 0:
+                    wis += per_episode_w_i[i] * episode.transitions[0].n_step_discounted_rewards
+            wis /= total_w_i_sum_across_episodes
         return wis
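
The weighted importance sampling estimate is the weight-normalized average of per-episode returns, so the new guards avoid dividing by zero when every per-episode weight is zero (for example, when the evaluated policy assigns zero probability to some logged action in every episode) and skip episodes with no transitions. A minimal sketch of the estimator under those guards, with illustrative names:

    # Sketch of per-episode weighted importance sampling; names are illustrative.
    # per_episode_w_i[i] is the product of per-step importance ratios for episode i,
    # returns[i] is that episode's discounted return.
    def weighted_importance_sampling(per_episode_w_i, returns):
        total_w = sum(per_episode_w_i)
        if total_w == 0:
            return 0.0  # no episode received any weight; avoid division by zero
        return sum(w * g for w, g in zip(per_episode_w_i, returns)) / total_w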