Adding initial interface for backend and redis pubsub (#19)

* Adding initial interface for backend and redis pubsub
* Addressing comments, adding super in all memories
* Removing distributed experience replay
2026-02-14 12:55:51 +01:00 · 2018-10-03 15:07:48 -07:00
parent a54ef2757f
commit 6b2de6ba6d
21 changed files with 459 additions and 444 deletions
--- a/rl_coach/orchestrators/start_training.py
+++ b/rl_coach/orchestrators/start_training.py
@@ -1,51 +1,43 @@
 import argparse

-from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes
+from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes, RunTypeParameters
+from rl_coach.memories.backend.redis import RedisPubSubMemoryBackendParameters


-def main(preset: str, image: str='ajaysudh/testing:coach', redis_ip: str=None, redis_port:int=None, num_workers: int=1, nfs_server: str="", nfs_path: str=""):
+def main(preset: str, image: str='ajaysudh/testing:coach', num_workers: int=1, nfs_server: str="", nfs_path: str="", memory_backend: str=""):
    rollout_command = ['python3', 'rl_coach/rollout_worker.py', '-p', preset]
    training_command = ['python3', 'rl_coach/training_worker.py', '-p', preset]

-    """
-    TODO:
-    1. Create a NFS backed PV for checkpointing.
-        a. Include that in both (worker, trainer) containers.
-        b. Change checkpoint writing logic to always write to a temporary file and then rename.
-    2. Test e2e 1 loop.
-        a. Trainer writes a checkpoint
-        b. Rollout worker picks it and gathers experience, writes back to redis.
-        c. 1 rollout worker, 1 trainer.
-    3. Trainer should be a job (not a deployment)
-        a. When all the epochs of training are done, workers should also be deleted.
-    4. Test e2e with multiple rollout workers.
-    5. Test e2e with multiple rollout workers and multiple loops.
-    """
+    memory_backend_params = RedisPubSubMemoryBackendParameters()

-    training_params = KubernetesParameters("train", image, training_command, kubeconfig='~/.kube/config', redis_ip=redis_ip, redis_port=redis_port,
-                                           nfs_server=nfs_server, nfs_path=nfs_path)
-    training_obj = Kubernetes(training_params)
-    if not training_obj.setup():
+    worker_run_type_params = RunTypeParameters(image, rollout_command, run_type="worker")
+    trainer_run_type_params = RunTypeParameters(image, training_command, run_type="trainer")
+
+    orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params], kubeconfig='~/.kube/config', nfs_server=nfs_server,
+                                                nfs_path=nfs_path, memory_backend_parameters=memory_backend_params)
+    orchestrator = Kubernetes(orchestration_params)
+    if not orchestrator.setup():
        print("Could not setup")
        return

-    rollout_params = KubernetesParameters("worker", image, rollout_command, kubeconfig='~/.kube/config', redis_ip=training_params.redis_ip, redis_port=training_params.redis_port, num_workers=num_workers)
-    rollout_obj = Kubernetes(rollout_params)
-    # if not rollout_obj.setup():
-    #     print("Could not setup")
-
-    if training_obj.deploy():
+    if orchestrator.deploy_trainer():
        print("Successfully deployed")
    else:
        print("Could not deploy")
        return

-    if rollout_obj.deploy():
+    if orchestrator.deploy_worker():
        print("Successfully deployed")
    else:
        print("Could not deploy")
        return

+    try:
+        orchestrator.trainer_logs()
+    except KeyboardInterrupt:
+        pass
+    orchestrator.undeploy()
+

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
@@ -65,6 +57,10 @@ if __name__ == '__main__':
                        help="(string) Exported path for the nfs server",
                        type=str,
                        required=True)
+    parser.add_argument('--memory_backend',
+                        help="(string) Memory backend to use",
+                        type=str,
+                        default="redispubsub")

    # parser.add_argument('--checkpoint_dir',
    #                     help='(string) Path to a folder containing a checkpoint to write the model to.',
@@ -72,4 +68,4 @@ if __name__ == '__main__':
    #                     default='/checkpoint')
    args = parser.parse_args()

-    main(preset=args.preset, image=args.image, nfs_server=args.nfs_server, nfs_path=args.nfs_path)
+    main(preset=args.preset, image=args.image, nfs_server=args.nfs_server, nfs_path=args.nfs_path, memory_backend=args.memory_backend)