From 5674749ed523e5214e758c24bc8a0fd62259680c Mon Sep 17 00:00:00 2001
From: Gal Leibovich
Date: Mon, 26 Nov 2018 00:08:43 +0200
Subject: [PATCH] workaround for resolving the issue of restoring a multi-node training checkpoint to single worker (#156)

---
 rl_coach/graph_managers/graph_manager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rl_coach/graph_managers/graph_manager.py b/rl_coach/graph_managers/graph_manager.py
index 9e23610..d13a59b 100644
--- a/rl_coach/graph_managers/graph_manager.py
+++ b/rl_coach/graph_managers/graph_manager.py
@@ -546,7 +546,8 @@ class GraphManager(object):
 
         # TODO: find better way to load checkpoints that were saved with a global network into the online network
         if self.task_parameters.checkpoint_restore_dir:
-            if self.task_parameters.framework_type == Frameworks.tensorflow:
+            if self.task_parameters.framework_type == Frameworks.tensorflow and\
+                    'checkpoint' in os.listdir(self.task_parameters.checkpoint_restore_dir):
                 # TODO-fixme checkpointing
                 # MonitoredTrainingSession manages save/restore checkpoints autonomously. Doing so,
                 # it creates it own names for the saved checkpoints, which do not match the "{}_Step-{}.ckpt" filename
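
For reference, a minimal standalone sketch (not part of the patch) of the check this change introduces. It assumes the usual TensorFlow layout in which a directory written by a MonitoredTrainingSession contains a 'checkpoint' index file next to the .ckpt data files; the helper name below is hypothetical and not a Coach API.

    import os

    def is_monitored_session_checkpoint_dir(checkpoint_restore_dir):
        # Hypothetical helper: a directory saved by TensorFlow's
        # MonitoredTrainingSession holds a 'checkpoint' index file, while a
        # directory saved with Coach's own "{}_Step-{}.ckpt" naming may not.
        # The patch uses this distinction to decide whether the
        # TensorFlow-specific restore path should run.
        return 'checkpoint' in os.listdir(checkpoint_restore_dir)

    # Usage sketch (names are illustrative, not Coach's actual attributes):
    # if framework_is_tensorflow and is_monitored_session_checkpoint_dir(restore_dir):
    #     ...restore via the MonitoredTrainingSession checkpoint machinery...
    # else:
    #     ...fall back to the regular single-worker restore...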