1
0
mirror of https://github.com/gryf/coach.git synced 2025-12-18 11:40:18 +01:00

Add documentation on distributed Coach. (#158)

* Added documentation on distributed Coach.
This commit is contained in:
Balaji Subramaniam
2018-11-27 02:26:15 -08:00
committed by Gal Novik
parent e3ecf445e2
commit d06197f663
151 changed files with 5302 additions and 643 deletions

View File

@@ -54,8 +54,17 @@ class KubernetesParameters(DeployParameters):
class Kubernetes(Deploy):
"""
An orchestrator implementation which uses Kubernetes to deploy the components such as training and rollout workers
and Redis Pub/Sub in Coach when used in the distributed mode.
"""
def __init__(self, params: KubernetesParameters):
"""
:param params: The Kubernetes parameters which are used for deploying the components in Coach. These parameters
include namespace and kubeconfig.
"""
super().__init__(params)
self.params = params
if self.params.kubeconfig:
@@ -93,6 +102,9 @@ class Kubernetes(Deploy):
self.s3_secret_key = os.environ.get('SECRET_ACCESS_KEY')
def setup(self) -> bool:
"""
Deploys the memory backend and data stores if required.
"""
self.memory_backend.deploy()
if not self.data_store.deploy():
@@ -102,6 +114,9 @@ class Kubernetes(Deploy):
return True
def deploy_trainer(self) -> bool:
"""
Deploys the training worker in Kubernetes.
"""
trainer_params = self.params.run_type_params.get(str(RunType.TRAINER), None)
if not trainer_params:
@@ -179,6 +194,9 @@ class Kubernetes(Deploy):
return False
def deploy_worker(self):
"""
Deploys the rollout worker(s) in Kubernetes.
"""
worker_params = self.params.run_type_params.get(str(RunType.ROLLOUT_WORKER), None)
if not worker_params:
@@ -258,6 +276,9 @@ class Kubernetes(Deploy):
return False
def worker_logs(self, path='./logs'):
"""
:param path: Path to store the worker logs.
"""
worker_params = self.params.run_type_params.get(str(RunType.ROLLOUT_WORKER), None)
if not worker_params:
return
@@ -288,6 +309,9 @@ class Kubernetes(Deploy):
self.tail_log(pod_name, api_client)
def trainer_logs(self):
"""
Get the logs from the trainer.
"""
trainer_params = self.params.run_type_params.get(str(RunType.TRAINER), None)
if not trainer_params:
return
@@ -346,6 +370,10 @@ class Kubernetes(Deploy):
return
def undeploy(self):
"""
Undeploy all the components, such as the trainer and rollout worker(s), Redis Pub/Sub and the data store, when required.
"""
trainer_params = self.params.run_type_params.get(str(RunType.TRAINER), None)
api_client = k8sclient.BatchV1Api()
delete_options = k8sclient.V1DeleteOptions(