From 5674749ed523e5214e758c24bc8a0fd62259680c Mon Sep 17 00:00:00 2001
From: Gal Leibovich
Date: Mon, 26 Nov 2018 00:08:43 +0200
Subject: [PATCH] workaround for resolving the issue of restoring a multi-node training checkpoint to single worker (#156)

---
 rl_coach/graph_managers/graph_manager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rl_coach/graph_managers/graph_manager.py b/rl_coach/graph_managers/graph_manager.py
index 9e23610..d13a59b 100644
--- a/rl_coach/graph_managers/graph_manager.py
+++ b/rl_coach/graph_managers/graph_manager.py
@@ -546,7 +546,8 @@ class GraphManager(object):
 
         # TODO: find better way to load checkpoints that were saved with a global network into the online network
         if self.task_parameters.checkpoint_restore_dir:
-            if self.task_parameters.framework_type == Frameworks.tensorflow:
+            if self.task_parameters.framework_type == Frameworks.tensorflow and\
+                    'checkpoint' in os.listdir(self.task_parameters.checkpoint_restore_dir):
                 # TODO-fixme checkpointing
                 # MonitoredTrainingSession manages save/restore checkpoints autonomously. Doing so,
                 # it creates it own names for the saved checkpoints, which do not match the "{}_Step-{}.ckpt" filename
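
For reference, a minimal standalone sketch (not part of the patch) of the check this change introduces. It assumes the usual TensorFlow layout in which a directory written by a MonitoredTrainingSession contains a 'checkpoint' index file next to the .ckpt data files; the helper name below is hypothetical and not a Coach API.

    import os

    def is_monitored_session_checkpoint_dir(checkpoint_restore_dir):
        # Hypothetical helper: a directory saved by TensorFlow's
        # MonitoredTrainingSession holds a 'checkpoint' index file, while a
        # directory saved with Coach's own "{}_Step-{}.ckpt" naming may not.
        # The patch uses this distinction to decide whether the
        # TensorFlow-specific restore path should run.
        return 'checkpoint' in os.listdir(checkpoint_restore_dir)

    # Usage sketch (names are illustrative, not Coach's actual attributes):
    # if framework_is_tensorflow and is_monitored_session_checkpoint_dir(restore_dir):
    #     ...restore via the MonitoredTrainingSession checkpoint machinery...
    # else:
    #     ...fall back to the regular single-worker restore...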