From 10220be9befb142853ac44b4c06445c1007747c7 Mon Sep 17 00:00:00 2001
From: Gal Novik
Date: Sun, 3 Mar 2019 10:03:45 +0200
Subject: [PATCH] Adding support for evaluation only mode with predefined number of steps (#225)

---
 rl_coach/agents/agent.py    | 12 ++++++------
 rl_coach/base_parameters.py | 10 ++++++----
 rl_coach/coach.py           | 18 ++++++++++++------
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/rl_coach/agents/agent.py b/rl_coach/agents/agent.py
index dd0f3da..e7706c7 100644
--- a/rl_coach/agents/agent.py
+++ b/rl_coach/agents/agent.py
@@ -397,8 +397,7 @@ class Agent(AgentInterface):
             success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
             self.agent_logger.create_signal_value(
                 "Success Rate",
-                success_rate
-            )
+                success_rate)
             if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
                 screen.log_title("{}: Finished evaluation phase. Success rate = {}, Avg Total Reward = {}"
                                  .format(self.name, np.round(success_rate, 2), np.round(evaluation_reward, 2)))
@@ -488,10 +487,11 @@ class Agent(AgentInterface):
         self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
         self.agent_logger.update_wall_clock_time(self.current_episode)
 
-        if self._phase != RunPhase.TEST:
-            self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
-            self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
-            self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
+        # The following signals are created with meaningful values only when an evaluation phase is completed.
+        # Creating with default NaNs for any HEATUP/TRAIN/TEST episode which is not the last in an evaluation phase
+        self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
+        self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
+        self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
 
         for signal in self.episode_signals:
             self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())

diff --git a/rl_coach/base_parameters.py b/rl_coach/base_parameters.py
index da368c3..3c03de8 100644
--- a/rl_coach/base_parameters.py
+++ b/rl_coach/base_parameters.py
@@ -550,13 +550,14 @@ class AgentParameters(Parameters):
 
 
 class TaskParameters(Parameters):
-    def __init__(self, framework_type: Frameworks=Frameworks.tensorflow, evaluate_only: bool=False, use_cpu: bool=False,
+    def __init__(self, framework_type: Frameworks=Frameworks.tensorflow, evaluate_only: int=None, use_cpu: bool=False,
                  experiment_path='/tmp', seed=None, checkpoint_save_secs=None, checkpoint_restore_dir=None,
                  checkpoint_save_dir=None, export_onnx_graph: bool=False, apply_stop_condition: bool=False,
                  num_gpu: int=1):
         """
         :param framework_type: deep learning framework type. currently only tensorflow is supported
-        :param evaluate_only: the task will be used only for evaluating the model
+        :param evaluate_only: if not None, the task will be used only for evaluating the model for the given number of steps.
+            A value of 0 means that the task will be evaluated for an infinite number of steps.
         :param use_cpu: use the cpu for this task
         :param experiment_path: the path to the directory which will store all the experiment outputs
         :param seed: a seed to use for the random numbers generator
@@ -583,13 +584,14 @@ class TaskParameters(Parameters):
 
 class DistributedTaskParameters(TaskParameters):
     def __init__(self, framework_type: Frameworks, parameters_server_hosts: str, worker_hosts: str, job_type: str,
-                 task_index: int, evaluate_only: bool=False, num_tasks: int=None,
+                 task_index: int, evaluate_only: int=None, num_tasks: int=None,
                  num_training_tasks: int=None, use_cpu: bool=False, experiment_path=None, dnd=None,
                  shared_memory_scratchpad=None, seed=None, checkpoint_save_secs=None, checkpoint_restore_dir=None,
                  checkpoint_save_dir=None, export_onnx_graph: bool=False, apply_stop_condition: bool=False):
         """
         :param framework_type: deep learning framework type. currently only tensorflow is supported
-        :param evaluate_only: the task will be used only for evaluating the model
+        :param evaluate_only: if not None, the task will be used only for evaluating the model for the given number of steps.
+            A value of 0 means that the task will be evaluated for an infinite number of steps.
         :param parameters_server_hosts: comma-separated list of hostname:port pairs to which the parameter servers are
             assigned
         :param worker_hosts: comma-separated list of hostname:port pairs to which the workers are assigned

diff --git a/rl_coach/coach.py b/rl_coach/coach.py
index 944bd16..304cf83 100644
--- a/rl_coach/coach.py
+++ b/rl_coach/coach.py
@@ -76,8 +76,10 @@ def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'
     graph_manager.create_graph(task_parameters)
 
     # let the adventure begin
-    if task_parameters.evaluate_only:
-        graph_manager.evaluate(EnvironmentSteps(sys.maxsize))
+    if task_parameters.evaluate_only is not None:
+        steps_to_evaluate = task_parameters.evaluate_only if task_parameters.evaluate_only > 0 \
+            else sys.maxsize
+        graph_manager.evaluate(EnvironmentSteps(steps_to_evaluate))
     else:
         graph_manager.improve()
     graph_manager.close()
@@ -466,9 +468,13 @@ class CoachLauncher(object):
                                  "This option will save a replay buffer with the game play.",
                             action='store_true')
         parser.add_argument('--evaluate',
-                            help="(flag) Run evaluation only. This is a convenient way to disable "
-                                 "training in order to evaluate an existing checkpoint.",
-                            action='store_true')
+                            help="(int) Run evaluation only, for at least the given number of steps (note that complete "
+                                 "episodes are evaluated). This is a convenient way to disable training in order "
+                                 "to evaluate an existing checkpoint. If the value is 0, or no value is provided, "
+                                 "evaluation will run for an infinite number of steps.",
+                            nargs='?',
+                            const=0,
+                            type=int)
         parser.add_argument('-v', '--verbosity',
                             help="(flag) Sets the verbosity level of Coach print outs. Can be either low or high.",
                             default="low",
@@ -659,7 +665,7 @@ class CoachLauncher(object):
                 worker_hosts=worker_hosts,
                 job_type=job_type,
                 task_index=task_index,
-                evaluate_only=evaluation_worker,
+                evaluate_only=0 if evaluation_worker else None,  # 0 value for evaluation worker as it should run infinitely
                 use_cpu=args.use_cpu,
                 num_tasks=total_tasks,  # training tasks + 1 evaluation task
                 num_training_tasks=args.num_workers,
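
Usage note (not part of the patch): a minimal sketch of how the reworked evaluate_only parameter behaves after this change, assuming a graph manager already created from a preset. The run() wrapper and the checkpoint path below are illustrative assumptions; the TaskParameters signature and the dispatch logic mirror the start_graph() hunk above.

import sys

from rl_coach.base_parameters import TaskParameters
from rl_coach.core_types import EnvironmentSteps

def run(graph_manager, task_parameters):
    # evaluate_only semantics introduced by this patch:
    #   None (default) -> regular training via graph_manager.improve()
    #   0              -> evaluate indefinitely (sys.maxsize environment steps)
    #   N > 0          -> evaluate for at least N environment steps (complete episodes)
    if task_parameters.evaluate_only is not None:
        steps = task_parameters.evaluate_only if task_parameters.evaluate_only > 0 else sys.maxsize
        graph_manager.evaluate(EnvironmentSteps(steps))
    else:
        graph_manager.improve()

# Example: evaluate a restored checkpoint for at least 5000 steps.
# The checkpoint directory is a hypothetical placeholder.
task_parameters = TaskParameters(evaluate_only=5000,
                                 checkpoint_restore_dir='/tmp/my_experiment/checkpoint')

From the command line, the same behavior is exposed through the reworked flag: coach -p <preset> --evaluate 5000 evaluates for at least 5000 steps, while a bare --evaluate (or --evaluate 0) evaluates until interrupted.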