mirror of
https://github.com/gryf/coach.git
synced 2025-12-18 03:30:19 +01:00
Adding nfs pv, pvc, waiting for memory to be full
This commit is contained in:
committed by
zach dwiel
parent
13d81f65b9
commit
98850464cc
@@ -3,30 +3,48 @@ import argparse
|
||||
from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes
|
||||
|
||||
|
||||
def main(preset, image='ajaysudh/testing:coach', redis_ip='redis-service.ajay.svc'):
|
||||
def main(preset: str, image: str='ajaysudh/testing:coach', redis_ip: str=None, redis_port:int=None, num_workers: int=1, nfs_server: str="", nfs_path: str=""):
|
||||
rollout_command = ['python3', 'rl_coach/rollout_worker.py', '-p', preset]
|
||||
training_command = ['python3', 'rl_coach/training_worker.py', '-p', preset]
|
||||
|
||||
rollout_params = KubernetesParameters(image, rollout_command, redis_ip=redis_ip, redis_port=6379, num_workers=1)
|
||||
training_params = KubernetesParameters(image, training_command, redis_ip=redis_ip, redis_port=6379, num_workers=1)
|
||||
"""
|
||||
TODO:
|
||||
1. Create a NFS backed PV for checkpointing.
|
||||
a. Include that in both (worker, trainer) containers.
|
||||
b. Change checkpoint writing logic to always write to a temporary file and then rename.
|
||||
2. Test e2e 1 loop.
|
||||
a. Trainer writes a checkpoint
|
||||
b. Rollout worker picks it and gathers experience, writes back to redis.
|
||||
c. 1 rollout worker, 1 trainer.
|
||||
3. Trainer should be a job (not a deployment)
|
||||
a. When all the epochs of training are done, workers should also be deleted.
|
||||
4. Test e2e with multiple rollout workers.
|
||||
5. Test e2e with multiple rollout workers and multiple loops.
|
||||
"""
|
||||
|
||||
training_params = KubernetesParameters("train", image, training_command, kubeconfig='~/.kube/config', redis_ip=redis_ip, redis_port=redis_port,
|
||||
nfs_server=nfs_server, nfs_path=nfs_path)
|
||||
training_obj = Kubernetes(training_params)
|
||||
if not training_obj.setup():
|
||||
print("Could not setup")
|
||||
return
|
||||
|
||||
rollout_obj = Kubernetes(training_params)
|
||||
if not rollout_obj.setup():
|
||||
print("Could not setup")
|
||||
rollout_params = KubernetesParameters("worker", image, rollout_command, kubeconfig='~/.kube/config', redis_ip=training_params.redis_ip, redis_port=training_params.redis_port, num_workers=num_workers)
|
||||
rollout_obj = Kubernetes(rollout_params)
|
||||
# if not rollout_obj.setup():
|
||||
# print("Could not setup")
|
||||
|
||||
if training_obj.deploy():
|
||||
print("Successfully deployed")
|
||||
else:
|
||||
print("Could not deploy")
|
||||
return
|
||||
|
||||
if rollout_obj.deploy():
|
||||
print("Successfully deployed")
|
||||
else:
|
||||
print("Could not deploy")
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -39,10 +57,19 @@ if __name__ == '__main__':
|
||||
help="(string) Name of a preset to run (class name from the 'presets' directory.)",
|
||||
type=str,
|
||||
required=True)
|
||||
parser.add_argument('-ns', '--nfs-server',
|
||||
help="(string) Addresss of the nfs server.)",
|
||||
type=str,
|
||||
required=True)
|
||||
parser.add_argument('-np', '--nfs-path',
|
||||
help="(string) Exported path for the nfs server",
|
||||
type=str,
|
||||
required=True)
|
||||
|
||||
# parser.add_argument('--checkpoint_dir',
|
||||
# help='(string) Path to a folder containing a checkpoint to write the model to.',
|
||||
# type=str,
|
||||
# default='/checkpoint')
|
||||
args = parser.parse_args()
|
||||
|
||||
main(preset=args.preset, image=args.image)
|
||||
main(preset=args.preset, image=args.image, nfs_server=args.nfs_server, nfs_path=args.nfs_path)
|
||||
|
||||
Reference in New Issue
Block a user