1
0
mirror of https://github.com/gryf/coach.git synced 2026-03-18 15:53:35 +01:00

Fixes for having NumpySharedRunningStats syncing on multi-node (#139)

1. Use the standard checkpoint prefix so that the data store picks up the checkpoint and syncs it to S3.
2. Remove the reference to Redis so that pickling does not try to include it.
3. Enable restoring a checkpoint that was saved by a single-node, multiple-worker run into a single-worker run.
This commit is contained in:
Gal Leibovich
2018-11-23 16:11:47 +02:00
committed by GitHub
parent 87a7848b0a
commit a1c56edd98
12 changed files with 154 additions and 99 deletions

View File

@@ -269,14 +269,19 @@ class QDND(object):
def load_dnd(model_dir):
max_id = 0
latest_checkpoint_id = -1
latest_checkpoint = ''
# get all checkpoint files
for fname in os.listdir(model_dir):
path = os.path.join(model_dir, fname)
if os.path.isdir(path) or fname.split('.')[-1] != 'srs':
continue
checkpoint_id = int(fname.split('_')[0])
if checkpoint_id > latest_checkpoint_id:
latest_checkpoint = fname
latest_checkpoint_id = checkpoint_id
for f in [s for s in os.listdir(model_dir) if s.endswith('.dnd')]:
if int(f.split('.')[0]) > max_id:
max_id = int(f.split('.')[0])
model_path = str(max_id) + '.dnd'
with open(os.path.join(model_dir, model_path), 'rb') as f:
with open(os.path.join(model_dir, str(latest_checkpoint)), 'rb') as f:
DND = pickle.load(f)
for a in range(DND.num_actions):