Adding nfs pv, pvc, waiting for memory to be full

2026-02-11 19:25:53 +01:00 · 2018-09-19 09:03:11 -07:00
parent 13d81f65b9
commit 98850464cc
4 changed files with 188 additions and 31 deletions
--- a/rl_coach/orchestrators/start_training.py
+++ b/rl_coach/orchestrators/start_training.py
@@ -3,30 +3,48 @@ import argparse
 from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes


-def main(preset, image='ajaysudh/testing:coach', redis_ip='redis-service.ajay.svc'):
+def main(preset: str, image: str='ajaysudh/testing:coach', redis_ip: str=None, redis_port:int=None, num_workers: int=1, nfs_server: str="", nfs_path: str=""):
    rollout_command = ['python3', 'rl_coach/rollout_worker.py', '-p', preset]
    training_command = ['python3', 'rl_coach/training_worker.py', '-p', preset]

-    rollout_params = KubernetesParameters(image, rollout_command, redis_ip=redis_ip, redis_port=6379, num_workers=1)
-    training_params = KubernetesParameters(image, training_command, redis_ip=redis_ip, redis_port=6379, num_workers=1)
+    """
+    TODO:
+    1. Create a NFS backed PV for checkpointing.
+        a. Include that in both (worker, trainer) containers.
+        b. Change checkpoint writing logic to always write to a temporary file and then rename.
+    2. Test e2e 1 loop.
+        a. Trainer writes a checkpoint
+        b. Rollout worker picks it and gathers experience, writes back to redis.
+        c. 1 rollout worker, 1 trainer.
+    3. Trainer should be a job (not a deployment)
+        a. When all the epochs of training are done, workers should also be deleted.
+    4. Test e2e with multiple rollout workers.
+    5. Test e2e with multiple rollout workers and multiple loops.
+    """

+    training_params = KubernetesParameters("train", image, training_command, kubeconfig='~/.kube/config', redis_ip=redis_ip, redis_port=redis_port,
+                                           nfs_server=nfs_server, nfs_path=nfs_path)
    training_obj = Kubernetes(training_params)
    if not training_obj.setup():
        print("Could not setup")
+        return

-    rollout_obj = Kubernetes(training_params)
-    if not rollout_obj.setup():
-        print("Could not setup")
+    rollout_params = KubernetesParameters("worker", image, rollout_command, kubeconfig='~/.kube/config', redis_ip=training_params.redis_ip, redis_port=training_params.redis_port, num_workers=num_workers)
+    rollout_obj = Kubernetes(rollout_params)
+    # if not rollout_obj.setup():
+    #     print("Could not setup")

    if training_obj.deploy():
        print("Successfully deployed")
    else:
        print("Could not deploy")
+        return

    if rollout_obj.deploy():
        print("Successfully deployed")
    else:
        print("Could not deploy")
+        return


 if __name__ == '__main__':
@@ -39,10 +57,19 @@ if __name__ == '__main__':
                        help="(string) Name of a preset to run (class name from the 'presets' directory.)",
                        type=str,
                        required=True)
+    parser.add_argument('-ns', '--nfs-server',
+                        help="(string) Addresss of the nfs server.)",
+                        type=str,
+                        required=True)
+    parser.add_argument('-np', '--nfs-path',
+                        help="(string) Exported path for the nfs server",
+                        type=str,
+                        required=True)
+
    # parser.add_argument('--checkpoint_dir',
    #                     help='(string) Path to a folder containing a checkpoint to write the model to.',
    #                     type=str,
    #                     default='/checkpoint')
    args = parser.parse_args()

-    main(preset=args.preset, image=args.image)
+    main(preset=args.preset, image=args.image, nfs_server=args.nfs_server, nfs_path=args.nfs_path)