1
0
mirror of https://github.com/gryf/coach.git synced 2025-12-18 03:30:19 +01:00

Adding initial interface for backend and redis pubsub (#19)

* Adding initial interface for backend and redis pubsub

* Addressing comments, adding super in all memories

* Removing distributed experience replay
This commit is contained in:
Ajay Deshpande
2018-10-03 15:07:48 -07:00
committed by zach dwiel
parent a54ef2757f
commit 6b2de6ba6d
21 changed files with 459 additions and 444 deletions

View File

@@ -1,51 +1,43 @@
import argparse
from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes
from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes, RunTypeParameters
from rl_coach.memories.backend.redis import RedisPubSubMemoryBackendParameters
def main(preset: str, image: str='ajaysudh/testing:coach', redis_ip: str=None, redis_port:int=None, num_workers: int=1, nfs_server: str="", nfs_path: str=""):
def main(preset: str, image: str='ajaysudh/testing:coach', num_workers: int=1, nfs_server: str="", nfs_path: str="", memory_backend: str=""):
rollout_command = ['python3', 'rl_coach/rollout_worker.py', '-p', preset]
training_command = ['python3', 'rl_coach/training_worker.py', '-p', preset]
"""
TODO:
1. Create a NFS backed PV for checkpointing.
a. Include that in both (worker, trainer) containers.
b. Change checkpoint writing logic to always write to a temporary file and then rename.
2. Test e2e 1 loop.
a. Trainer writes a checkpoint
b. Rollout worker picks it and gathers experience, writes back to redis.
c. 1 rollout worker, 1 trainer.
3. Trainer should be a job (not a deployment)
a. When all the epochs of training are done, workers should also be deleted.
4. Test e2e with multiple rollout workers.
5. Test e2e with multiple rollout workers and multiple loops.
"""
memory_backend_params = RedisPubSubMemoryBackendParameters()
training_params = KubernetesParameters("train", image, training_command, kubeconfig='~/.kube/config', redis_ip=redis_ip, redis_port=redis_port,
nfs_server=nfs_server, nfs_path=nfs_path)
training_obj = Kubernetes(training_params)
if not training_obj.setup():
worker_run_type_params = RunTypeParameters(image, rollout_command, run_type="worker")
trainer_run_type_params = RunTypeParameters(image, training_command, run_type="trainer")
orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params], kubeconfig='~/.kube/config', nfs_server=nfs_server,
nfs_path=nfs_path, memory_backend_parameters=memory_backend_params)
orchestrator = Kubernetes(orchestration_params)
if not orchestrator.setup():
print("Could not setup")
return
rollout_params = KubernetesParameters("worker", image, rollout_command, kubeconfig='~/.kube/config', redis_ip=training_params.redis_ip, redis_port=training_params.redis_port, num_workers=num_workers)
rollout_obj = Kubernetes(rollout_params)
# if not rollout_obj.setup():
# print("Could not setup")
if training_obj.deploy():
if orchestrator.deploy_trainer():
print("Successfully deployed")
else:
print("Could not deploy")
return
if rollout_obj.deploy():
if orchestrator.deploy_worker():
print("Successfully deployed")
else:
print("Could not deploy")
return
try:
orchestrator.trainer_logs()
except KeyboardInterrupt:
pass
orchestrator.undeploy()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
@@ -65,6 +57,10 @@ if __name__ == '__main__':
help="(string) Exported path for the nfs server",
type=str,
required=True)
parser.add_argument('--memory_backend',
help="(string) Memory backend to use",
type=str,
default="redispubsub")
# parser.add_argument('--checkpoint_dir',
# help='(string) Path to a folder containing a checkpoint to write the model to.',
@@ -72,4 +68,4 @@ if __name__ == '__main__':
# default='/checkpoint')
args = parser.parse_args()
main(preset=args.preset, image=args.image, nfs_server=args.nfs_server, nfs_path=args.nfs_path)
main(preset=args.preset, image=args.image, nfs_server=args.nfs_server, nfs_path=args.nfs_path, memory_backend=args.memory_backend)