1
0
mirror of https://github.com/gryf/coach.git synced 2026-03-13 04:55:47 +01:00

bug fixes for OPE (#311)

This commit is contained in:
Gal Leibovich
2019-05-21 16:39:11 +03:00
committed by GitHub
parent 85d70dd7d5
commit acceb03ac0
8 changed files with 38 additions and 21 deletions

View File

@@ -26,7 +26,9 @@ class SequentialDoublyRobust(object):
"""
Run the off-policy evaluator to get a score for the goodness of the new policy, based on the dataset,
which was collected using other policy(ies).
When the episodes are of varying lengths, this estimator might prove problematic due to its recursive nature
of summing rewards up to the end of the episode (horizon). It will probably work best with episodes of fixed
length.
Paper: https://arxiv.org/pdf/1511.03722.pdf
:return: the evaluation score
@@ -37,7 +39,7 @@ class SequentialDoublyRobust(object):
for episode in evaluation_dataset_as_episodes:
episode_seq_dr = 0
for transition in episode.transitions:
for transition in reversed(episode.transitions):
rho = transition.info['softmax_policy_prob'][transition.action] / \
transition.info['all_action_probabilities'][transition.action]
episode_seq_dr = transition.info['v_value_q_model_based'] + rho * (transition.reward + discount_factor

View File

@@ -46,8 +46,12 @@ class WeightedImportanceSampling(object):
per_episode_w_i.append(w_i)
total_w_i_sum_across_episodes = sum(per_episode_w_i)
wis = 0
for i, episode in enumerate(evaluation_dataset_as_episodes):
wis += per_episode_w_i[i]/total_w_i_sum_across_episodes * episode.transitions[0].n_step_discounted_rewards
if total_w_i_sum_across_episodes != 0:
for i, episode in enumerate(evaluation_dataset_as_episodes):
if len(episode.transitions) != 0:
wis += per_episode_w_i[i] * episode.transitions[0].n_step_discounted_rewards
wis /= total_w_i_sum_across_episodes
return wis