1
0
mirror of https://github.com/gryf/coach.git synced 2026-03-13 04:55:47 +01:00

bug fixes for OPE (#311)

This commit is contained in:
Gal Leibovich
2019-05-21 16:39:11 +03:00
committed by GitHub
parent 85d70dd7d5
commit acceb03ac0
8 changed files with 38 additions and 21 deletions

View File

@@ -26,7 +26,9 @@ class SequentialDoublyRobust(object):
"""
Run the off-policy evaluator to get a score for the goodness of the new policy, based on the dataset,
which was collected using other policy(ies).
When the episodes are of varying lengths, this estimator might prove problematic due to its recursive nature
of summing rewards up to the end of the episode (horizon). It will probably work best with episodes of fixed
length.
Paper: https://arxiv.org/pdf/1511.03722.pdf
:return: the evaluation score
@@ -37,7 +39,7 @@ class SequentialDoublyRobust(object):
for episode in evaluation_dataset_as_episodes:
episode_seq_dr = 0
for transition in episode.transitions:
for transition in reversed(episode.transitions):
rho = transition.info['softmax_policy_prob'][transition.action] / \
transition.info['all_action_probabilities'][transition.action]
episode_seq_dr = transition.info['v_value_q_model_based'] + rho * (transition.reward + discount_factor

View File

@@ -46,8 +46,12 @@ class WeightedImportanceSampling(object):
per_episode_w_i.append(w_i)
total_w_i_sum_across_episodes = sum(per_episode_w_i)
wis = 0
for i, episode in enumerate(evaluation_dataset_as_episodes):
wis += per_episode_w_i[i]/total_w_i_sum_across_episodes * episode.transitions[0].n_step_discounted_rewards
if total_w_i_sum_across_episodes != 0:
for i, episode in enumerate(evaluation_dataset_as_episodes):
if len(episode.transitions) != 0:
wis += per_episode_w_i[i] * episode.transitions[0].n_step_discounted_rewards
wis /= total_w_i_sum_across_episodes
return wis