mirror of
https://github.com/gryf/coach.git
synced 2026-03-13 04:55:47 +01:00
bug fixes for OPE (#311)
This commit is contained in:
@@ -26,7 +26,9 @@ class SequentialDoublyRobust(object):
|
||||
"""
|
||||
Run the off-policy evaluator to get a score for the goodness of the new policy, based on the dataset,
|
||||
which was collected using other policy(ies).
|
||||
|
||||
When the episodes are of changing lengths, this estimator might prove problematic due to its nature of recursion
|
||||
of adding rewards up to the end of the episode (horizon). It will probably work best with episodes of fixed
|
||||
length.
|
||||
Paper: https://arxiv.org/pdf/1511.03722.pdf
|
||||
|
||||
:return: the evaluation score
|
||||
@@ -37,7 +39,7 @@ class SequentialDoublyRobust(object):
|
||||
|
||||
for episode in evaluation_dataset_as_episodes:
|
||||
episode_seq_dr = 0
|
||||
for transition in episode.transitions:
|
||||
for transition in reversed(episode.transitions):
|
||||
rho = transition.info['softmax_policy_prob'][transition.action] / \
|
||||
transition.info['all_action_probabilities'][transition.action]
|
||||
episode_seq_dr = transition.info['v_value_q_model_based'] + rho * (transition.reward + discount_factor
|
||||
|
||||
@@ -46,8 +46,12 @@ class WeightedImportanceSampling(object):
|
||||
per_episode_w_i.append(w_i)
|
||||
|
||||
total_w_i_sum_across_episodes = sum(per_episode_w_i)
|
||||
|
||||
wis = 0
|
||||
for i, episode in enumerate(evaluation_dataset_as_episodes):
|
||||
wis += per_episode_w_i[i]/total_w_i_sum_across_episodes * episode.transitions[0].n_step_discounted_rewards
|
||||
if total_w_i_sum_across_episodes != 0:
|
||||
for i, episode in enumerate(evaluation_dataset_as_episodes):
|
||||
if len(episode.transitions) != 0:
|
||||
wis += per_episode_w_i[i] * episode.transitions[0].n_step_discounted_rewards
|
||||
wis /= total_w_i_sum_across_episodes
|
||||
|
||||
return wis
|
||||
|
||||
Reference in New Issue
Block a user