From 053adf0ca91bd8f4e6c3a59f96ad3aa502ab864e Mon Sep 17 00:00:00 2001 From: Scott Leishman Date: Wed, 9 Jan 2019 15:12:00 -0800 Subject: [PATCH] prevent long job CI timeouts owing to lack of EKS token refresh (#183) * add additional info when exceptions occur during EKS runs. * ensure we refresh k8s config after long calls. The Kubernetes client on EKS has a 10-minute token time-to-live, so long jobs fail with unauthorized errors unless the token is refreshed. --- rl_coach/tests/test_eks.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/rl_coach/tests/test_eks.py b/rl_coach/tests/test_eks.py index 1726713..2a4d9f2 100644 --- a/rl_coach/tests/test_eks.py +++ b/rl_coach/tests/test_eks.py @@ -15,12 +15,17 @@ class EKSHandler(): self.image = image self.cpu = cpu self.memory = memory - config.load_kube_config() + self.refresh_config() self.namespace = '{}-{}'.format(test_name, build_num) - self.corev1_api = client.CoreV1Api() self.create_namespace() self.working_dir = working_dir + def refresh_config(self): + # on AWS tokens only last 10 minutes so this must periodically be + # called to prevent auth related errors + config.load_kube_config() + self.corev1_api = client.CoreV1Api() + def create_namespace(self): namespace = client.V1Namespace( api_version='v1', @@ -73,13 +78,17 @@ class EKSHandler(): _preload_content=False ): print(line.decode('utf-8'), flush=True, end='') + # above call blocks for pod lifetime, so we may need to refresh tokens + self.refresh_config() except client.rest.ApiException as e: + print("Got exception: {} while reading pod logs".format(e)) pass try: pod = self.corev1_api.read_namespaced_pod(self.test_name, self.namespace) except client.rest.ApiException as e: + print("Got exception: {} while reading pod".format(e)) continue if not hasattr(pod, 'status') or not pod.status: @@ -104,6 +113,7 @@ class EKSHandler(): try: pod = self.corev1_api.read_namespaced_pod(self.test_name, self.namespace) except 
client.rest.ApiException as e: + print("Got exception: {} while reading pod".format(e)) return 1 if not hasattr(pod, 'status') or not pod.status: