1
0
mirror of https://github.com/gryf/coach.git synced 2025-12-17 19:20:19 +01:00

Workaround for the issue of restoring a multi-node training checkpoint to a single worker (#156)

This commit is contained in:
Gal Leibovich
2018-11-26 00:08:43 +02:00
committed by GitHub
parent ab10852ad9
commit 5674749ed5

View File

@@ -546,7 +546,8 @@ class GraphManager(object):
# TODO: find better way to load checkpoints that were saved with a global network into the online network
if self.task_parameters.checkpoint_restore_dir:
if self.task_parameters.framework_type == Frameworks.tensorflow:
if self.task_parameters.framework_type == Frameworks.tensorflow and\
'checkpoint' in os.listdir(self.task_parameters.checkpoint_restore_dir):
# TODO-fixme checkpointing
# MonitoredTrainingSession manages save/restore checkpoints autonomously. In doing so,
# it creates its own names for the saved checkpoints, which do not match the "{}_Step-{}.ckpt" filename