mirror of
https://github.com/gryf/coach.git
synced 2025-12-17 19:20:19 +01:00
first pass at kubernetes
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
REGISTRY=nervana-dockrepo01.fm.intel.com:5001/
|
||||||
IMAGE=zdwiel/coach
|
IMAGE=zdwiel/coach
|
||||||
# IMAGE=gcr.io/deep-greens/inference:v5
|
# IMAGE=gcr.io/deep-greens/inference:v5
|
||||||
|
|
||||||
@@ -43,5 +44,9 @@ run_training_worker: build
|
|||||||
run_rollout_worker: build
|
run_rollout_worker: build
|
||||||
${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 rl_coach/rollout_worker.py --preset CartPole_DQN_distributed
|
${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 rl_coach/rollout_worker.py --preset CartPole_DQN_distributed
|
||||||
|
|
||||||
push:
|
kubernetes: build push
|
||||||
docker push ${IMAGE}
|
kubectl run -i --tty --attach --image=${IMAGE} --restart=Never date -- python3 rl_coach/orchestrators/start_training.py --preset CartPole_DQN_distributed --image ${IMAGE}
|
||||||
|
|
||||||
|
push: build
|
||||||
|
${DOCKER} tag ${IMAGE} ${REGISTRY}${IMAGE}
|
||||||
|
${DOCKER} push ${REGISTRY}${IMAGE}
|
||||||
|
|||||||
48
rl_coach/orchestrators/start_training.py
Normal file
48
rl_coach/orchestrators/start_training.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
import argparse
|
||||||
|
|
||||||
|
from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes
|
||||||
|
|
||||||
|
|
||||||
|
def main(preset, image='ajaysudh/testing:coach', redis_ip='redis-service.ajay.svc'):
    """
    Set up and deploy one training worker and one rollout worker through the
    Kubernetes orchestrator.

    Both workers are started from the same docker image and receive the same
    redis address (port 6379); they differ only in the script they run.

    :param preset: name of the preset, forwarded to both worker scripts via ``-p``.
    :param image: name of the docker image used for both workers.
    :param redis_ip: address of the redis service passed to both workers' parameters.
    :return: None. Progress and failures are reported on stdout.
    """
    rollout_command = ['python3', 'rl_coach/rollout_worker.py', '-p', preset]
    training_command = ['python3', 'rl_coach/training_worker.py', '-p', preset]

    rollout_params = KubernetesParameters(image, rollout_command, redis_ip=redis_ip, redis_port=6379, num_workers=1)
    training_params = KubernetesParameters(image, training_command, redis_ip=redis_ip, redis_port=6379, num_workers=1)

    training_obj = Kubernetes(training_params)
    if not training_obj.setup():
        print("Could not setup")
        # Nothing useful can be deployed without a working setup.
        return

    # The rollout orchestrator must be built from the rollout parameters;
    # reusing the training parameters would deploy a second training worker
    # and never start the rollout worker.
    rollout_obj = Kubernetes(rollout_params)
    if not rollout_obj.setup():
        print("Could not setup")
        return

    # The trainer is deployed first, then the rollout worker; a failed
    # training deployment does not prevent the rollout attempt.
    if training_obj.deploy():
        print("Successfully deployed")
    else:
        print("Could not deploy")

    if rollout_obj.deploy():
        print("Successfully deployed")
    else:
        print("Could not deploy")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: both options are mandatory and are handed
    # straight to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--image',
        required=True,
        type=str,
        help="(string) Name of a docker image.",
    )
    cli.add_argument(
        '-p', '--preset',
        required=True,
        type=str,
        help="(string) Name of a preset to run (class name from the 'presets' directory.)",
    )
    # Checkpoint directory option is currently disabled:
    # cli.add_argument('--checkpoint_dir',
    #                  help='(string) Path to a folder containing a checkpoint to write the model to.',
    #                  type=str,
    #                  default='/checkpoint')
    parsed = cli.parse_args()

    main(preset=parsed.preset, image=parsed.image)
|
||||||
@@ -41,8 +41,8 @@ def wait_for_checkpoint(checkpoint_dir, timeout=10):
|
|||||||
return
|
return
|
||||||
|
|
||||||
raise ValueError((
|
raise ValueError((
|
||||||
'Waited {timeout} seconds, but checkpoint never found in'
|
'Waited {timeout} seconds, but checkpoint never found in '
|
||||||
' {checkpoint_dir}'
|
'{checkpoint_dir}'
|
||||||
).format(
|
).format(
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
checkpoint_dir=checkpoint_dir,
|
checkpoint_dir=checkpoint_dir,
|
||||||
|
|||||||
Reference in New Issue
Block a user