Workaround for restoring a multi-node training checkpoint to a single worker (#156)

2026-02-18 15:35:56 +01:00 · 2018-11-26 00:08:43 +02:00
parent ab10852ad9
commit 5674749ed5
1 changed files with 2 additions and 1 deletions
--- a/rl_coach/graph_managers/graph_manager.py
+++ b/rl_coach/graph_managers/graph_manager.py
@@ -546,7 +546,8 @@ class GraphManager(object):

        # TODO: find better way to load checkpoints that were saved with a global network into the online network
        if self.task_parameters.checkpoint_restore_dir:
-            if self.task_parameters.framework_type == Frameworks.tensorflow:
+            if self.task_parameters.framework_type == Frameworks.tensorflow and\
+                    'checkpoint' in os.listdir(self.task_parameters.checkpoint_restore_dir):
                # TODO-fixme checkpointing
                # MonitoredTrainingSession manages save/restore checkpoints autonomously. Doing so,
                # it creates its own names for the saved checkpoints, which do not match the "{}_Step-{}.ckpt" filename