1
0
mirror of https://github.com/gryf/coach.git synced 2025-12-17 19:20:19 +01:00

first pass at kubernetes

This commit is contained in:
Zach Dwiel
2018-09-17 22:31:17 +00:00
committed by zach dwiel
parent 3328b25549
commit 0812a94fbd
3 changed files with 57 additions and 4 deletions

View File

@@ -1,3 +1,4 @@
REGISTRY=nervana-dockrepo01.fm.intel.com:5001/
IMAGE=zdwiel/coach IMAGE=zdwiel/coach
# IMAGE=gcr.io/deep-greens/inference:v5 # IMAGE=gcr.io/deep-greens/inference:v5
@@ -43,5 +44,9 @@ run_training_worker: build
run_rollout_worker: build run_rollout_worker: build
${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 rl_coach/rollout_worker.py --preset CartPole_DQN_distributed ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 rl_coach/rollout_worker.py --preset CartPole_DQN_distributed
push: kubernetes: build push
docker push ${IMAGE} kubectl run -i --tty --attach --image=${IMAGE} --restart=Never date -- python3 rl_coach/orchestrators/start_training.py --preset CartPole_DQN_distributed --image ${IMAGE}
# push: re-tag ${IMAGE} under the ${REGISTRY} prefix and push that tag.
# Listed prerequisite: build.
push: build
${DOCKER} tag ${IMAGE} ${REGISTRY}${IMAGE}
${DOCKER} push ${REGISTRY}${IMAGE}

View File

@@ -0,0 +1,48 @@
import argparse
from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes
def main(preset, image='ajaysudh/testing:coach', redis_ip='redis-service.ajay.svc'):
    """Set up and deploy one training worker and one rollout worker.

    Args:
        preset: Preset name forwarded to both worker scripts through ``-p``.
        image: Docker image name handed to ``KubernetesParameters`` for both
            workers.
        redis_ip: Redis host handed to ``KubernetesParameters`` as ``redis_ip``
            (the port is fixed at 6379).

    Returns:
        None. The outcome of each setup/deploy step is reported with ``print``.
    """
    rollout_command = ['python3', 'rl_coach/rollout_worker.py', '-p', preset]
    training_command = ['python3', 'rl_coach/training_worker.py', '-p', preset]

    rollout_params = KubernetesParameters(image, rollout_command, redis_ip=redis_ip, redis_port=6379, num_workers=1)
    training_params = KubernetesParameters(image, training_command, redis_ip=redis_ip, redis_port=6379, num_workers=1)

    training_obj = Kubernetes(training_params)
    if not training_obj.setup():
        print("Could not setup")

    # The rollout orchestrator must be built from the rollout parameters;
    # building it from training_params would launch a second training worker
    # and leave rollout_params unused.
    rollout_obj = Kubernetes(rollout_params)
    if not rollout_obj.setup():
        print("Could not setup")

    # NOTE(review): deployment is still attempted after a failed setup, as in
    # the original flow - confirm whether it should bail out instead.
    if training_obj.deploy():
        print("Successfully deployed")
    else:
        print("Could not deploy")

    if rollout_obj.deploy():
        print("Successfully deployed")
    else:
        print("Could not deploy")
if __name__ == '__main__':
    # Script entry point: parse the two mandatory options and hand them to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--image',
        type=str,
        required=True,
        help="(string) Name of a docker image.",
    )
    cli.add_argument(
        '-p', '--preset',
        type=str,
        required=True,
        help="(string) Name of a preset to run (class name from the 'presets' directory.)",
    )

    # Disabled option, kept for reference:
    # cli.add_argument('--checkpoint_dir',
    #                  help='(string) Path to a folder containing a checkpoint to write the model to.',
    #                  type=str,
    #                  default='/checkpoint')

    options = cli.parse_args()
    main(preset=options.preset, image=options.image)

View File

@@ -41,8 +41,8 @@ def wait_for_checkpoint(checkpoint_dir, timeout=10):
return return
raise ValueError(( raise ValueError((
'Waited {timeout} seconds, but checkpoint never found in' 'Waited {timeout} seconds, but checkpoint never found in '
' {checkpoint_dir}' '{checkpoint_dir}'
).format( ).format(
timeout=timeout, timeout=timeout,
checkpoint_dir=checkpoint_dir, checkpoint_dir=checkpoint_dir,