diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..fdac537 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,136 @@ +aliases: + - &executor_prep + docker: + - image: circleci/python:3.7.0-stretch + working_directory: ~/repo + - &remote_docker + # ensure layers of constructed docker containers are cached for reuse between jobs. + setup_remote_docker: + docker_layer_caching: true + - &restore_cache + restore_cache: + keys: + - v1-dependencies-{{ checksum "requirements.txt" }} + # fallback to using the latest cache if no exact match is found + - v1-dependencies- + - &save_cache + save_cache: + paths: + - ./venv + key: v1-dependencies-{{ checksum "requirements.txt" }} + - &aws_prep + run: + name: Prepare aws cli + command: | + sudo pip install awscli pytest kubernetes==8.0.0b1 + export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode` + export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode` + + $(aws ecr get-login --no-include-email --region us-west-2) + sudo curl -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.10.3/2018-07-26/bin/linux/amd64/aws-iam-authenticator + sudo chmod a+x /usr/local/bin/aws-iam-authenticator + aws eks update-kubeconfig --name coach-aws-cicd + +version: 2 +jobs: + build: + <<: *executor_prep + steps: + - checkout + - *remote_docker + - *restore_cache + - *aws_prep + - run: + name: Build and push container + command: | + REGISTRY=316971102342.dkr.ecr.us-west-2.amazonaws.com + TAG=$(git describe --tags --always --dirty) + + docker pull ${REGISTRY}/coach-base:${MASTER_BRANCH} + docker build --cache-from ${REGISTRY}/coach-base:${MASTER_BRANCH} -t ${REGISTRY}/coach-base:${TAG} -f docker/Dockerfile.base . + + docker push ${REGISTRY}/coach-base:${TAG} + + docker tag ${REGISTRY}/coach-base:${TAG} coach-base:master + + docker build -t ${REGISTRY}/coach:${TAG} -f docker/Dockerfile . 
+ docker push ${REGISTRY}/coach:${TAG} + no_output_timeout: 30m + + unit_tests: + <<: *executor_prep + steps: + - checkout + - *remote_docker + - *restore_cache + - *aws_prep + - run: + name: run unit tests + command: | + export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode` + export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode` + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn unit-test -tc 'make unit_tests_without_docker' -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + + integration_tests: + <<: *executor_prep + steps: + - checkout + - *remote_docker + - *restore_cache + - *aws_prep + - run: + name: run integration tests + command: | + export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode` + export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode` + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn integration-test -tc 'make integration_tests_without_docker' -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + + golden_tests: + <<: *executor_prep + steps: + - checkout + - *remote_docker + - *restore_cache + - *aws_prep + - run: + name: run golden tests + command: | + export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode` + export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode` + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn golden-test -tc 'make golden_tests_without_docker' -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + + trace_tests: + <<: *executor_prep + steps: + - checkout + - *remote_docker + - *restore_cache + - *aws_prep + - run: + name: run trace tests + command: | + export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode` + export 
AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode` + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn trace-test -tc 'make trace_tests_without_docker' -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + +workflows: + version: 2 + build_and_test: + jobs: + - build + - unit_tests: + requires: + - build + - integration_tests: + requires: + - build + - e2e_approval: + type: approval + requires: + - build + - golden_tests: + requires: + - e2e_approval + - trace_tests: + requires: + - e2e_approval diff --git a/.gitignore b/.gitignore index 3d028c9..39eaaf1 100644 --- a/.gitignore +++ b/.gitignore @@ -20,12 +20,12 @@ rl_coach.egg* contrib test_log_* dist +.DS_Store datasets .cache .pytest_cache core trace_test* -.DS_Store *.swp *.swo .cache/ diff --git a/docker/Dockerfile b/docker/Dockerfile index b3de1a4..1fb5dc3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,120 +1,4 @@ -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - -# https://github.com/NVIDIA/nvidia-docker/issues/619 -RUN rm /etc/apt/sources.list.d/cuda.list -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get clean autoclean && \ - apt-get autoremove -y -RUN apt-get update && \ - apt-get install -y python-pip && \ - apt-get clean autoclean && \ - apt-get autoremove -y -RUN pip install pip --upgrade -WORKDIR /root - -################################ -# Install apt-get Requirements # -################################ - -# General -RUN apt-get update && \ - apt-get install -y python3-pip cmake zlib1g-dev python3-tk python-opencv && \ - apt-get clean autoclean && \ - apt-get autoremove -y - -# Boost libraries -RUN apt-get update && \ - apt-get install -y libboost-all-dev && \ - apt-get clean autoclean && \ - apt-get autoremove -y - -# Scipy requirements -RUN apt-get update && \ - apt-get install -y libblas-dev liblapack-dev libatlas-base-dev gfortran && \ - apt-get clean autoclean && \ 
- apt-get autoremove -y - -# Pygame requirements -RUN apt-get update && \ - apt-get install -y libsdl-dev libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev && \ - apt-get clean autoclean && \ - apt-get autoremove -y -RUN apt-get update && \ - apt-get install -y libsmpeg-dev libportmidi-dev libavformat-dev libswscale-dev && \ - apt-get clean autoclean && \ - apt-get autoremove -y - -# Dashboard -RUN apt-get update && \ - apt-get install -y dpkg-dev build-essential python3.5-dev libjpeg-dev libtiff-dev libsdl1.2-dev libnotify-dev \ - freeglut3 freeglut3-dev libsm-dev libgtk2.0-dev libgtk-3-dev libwebkitgtk-dev libgtk-3-dev \ - libwebkitgtk-3.0-dev libgstreamer-plugins-base1.0-dev && \ - apt-get clean autoclean && \ - apt-get autoremove -y - -# Gym -RUN apt-get update && \ - apt-get install -y libav-tools libsdl2-dev swig cmake && \ - apt-get clean autoclean && \ - apt-get autoremove -y - -# Mujoco_py -RUN apt-get update && \ - apt-get install -y curl libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev software-properties-common && \ - apt-get clean autoclean && \ - apt-get autoremove -y - -# ViZDoom -RUN apt-get update && \ - apt-get install -y build-essential zlib1g-dev libsdl2-dev libjpeg-dev \ - nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \ - libopenal-dev timidity libwildmidi-dev unzip wget && \ - apt-get clean autoclean && \ - apt-get autoremove -y - -############################ -# Install Pip Requirements # -############################ -RUN pip3 install --upgrade pip -RUN pip3 install pytest -RUN pip3 install pytest-xdist - -# initial installation of coach, so that the docker build won't install everything from scratch -RUN pip3 install rl_coach>=0.10.0 - -# install additional environments -RUN pip3 install gym[atari]==0.10.5 -RUN pip3 install mujoco_py==1.50.1.56 -RUN pip3 install vizdoom==1.1.6 - -# FROM ubuntu:16.04 -# -# RUN apt-get update \ -# && apt-get install -y \ -# python3-pip cmake zlib1g-dev python3-tk 
python-opencv \ -# libboost-all-dev \ -# libblas-dev liblapack-dev libatlas-base-dev gfortran \ -# libsdl-dev libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev \ -# libsmpeg-dev libportmidi-dev libavformat-dev libswscale-dev \ -# dpkg-dev build-essential python3.5-dev libjpeg-dev libtiff-dev \ -# libsdl1.2-dev libnotify-dev freeglut3 freeglut3-dev libsm-dev \ -# libgtk2.0-dev libgtk-3-dev libwebkitgtk-dev libgtk-3-dev \ -# libwebkitgtk-3.0-dev libgstreamer-plugins-base1.0-dev \ -# libav-tools libsdl2-dev swig -# -# # installing python dependencies -# RUN pip3 install --upgrade pip - -RUN apt-get update && apt-get install -y wget zip -RUN mkdir -p ~/.mujoco \ - && wget https://www.roboti.us/download/mjpro150_linux.zip -O mujoco.zip \ - && unzip mujoco.zip -d ~/.mujoco \ - && rm mujoco.zip -ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH - -RUN curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf \ - && chmod +x /usr/local/bin/patchelf +FROM coach-base:master RUN mkdir /root/src COPY setup.py /root/src/. 
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base new file mode 100644 index 0000000..8659096 --- /dev/null +++ b/docker/Dockerfile.base @@ -0,0 +1,63 @@ +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +# https://github.com/NVIDIA/nvidia-docker/issues/619 +RUN rm /etc/apt/sources.list.d/cuda.list +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get clean autoclean && \ + apt-get autoremove -y && apt-get update && \ + apt-get install -y python-pip && \ + apt-get clean autoclean && \ + apt-get autoremove -y +RUN pip install pip --upgrade +WORKDIR /root + +################################ +# Install apt-get Requirements # +################################ + +# General +RUN apt-get update && \ + apt-get install -y python3-pip cmake zlib1g-dev python3-tk python-opencv \ + # Boost libraries + libboost-all-dev \ + # Scipy requirements + libblas-dev liblapack-dev libatlas-base-dev gfortran \ + # Pygame requirements + libsdl-dev libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev \ + libsmpeg-dev libportmidi-dev libavformat-dev libswscale-dev \ + # Dashboard + dpkg-dev build-essential python3.5-dev libjpeg-dev libtiff-dev libsdl1.2-dev libnotify-dev \ + freeglut3 freeglut3-dev libsm-dev libgtk2.0-dev libgtk-3-dev libwebkitgtk-dev libgtk-3-dev \ + libwebkitgtk-3.0-dev libgstreamer-plugins-base1.0-dev \ + # Gym + libav-tools libsdl2-dev swig cmake \ + # Mujoco_py + curl libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev software-properties-common \ + # ViZDoom + build-essential zlib1g-dev libsdl2-dev libjpeg-dev \ + nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \ + libopenal-dev timidity libwildmidi-dev unzip wget && \ + apt-get clean autoclean && \ + apt-get autoremove -y + +############################ +# Install Pip Requirements # +############################ +RUN pip3 install --upgrade pip +RUN pip3 install pytest +RUN pip3 install pytest-xdist + +# initial installation of coach, so that the docker build won't 
install everything from scratch +RUN pip3 install rl_coach>=0.10.0 && pip3 install gym[atari]==0.10.5 && \ + pip3 install mujoco_py==1.50.1.56 && pip3 install vizdoom==1.1.6 + +RUN mkdir -p ~/.mujoco \ + && wget https://www.roboti.us/download/mjpro150_linux.zip -O mujoco.zip \ + && unzip mujoco.zip -d ~/.mujoco \ + && rm mujoco.zip +# COPY ./mjkey.txt /root/.mujoco/ +ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH + +RUN curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf \ + && chmod +x /usr/local/bin/patchelf diff --git a/docker/Makefile b/docker/Makefile index 3fd4b71..c409f8a 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -20,6 +20,11 @@ RUN_ARGUMENTS+=--rm RUN_ARGUMENTS+=--net host RUN_ARGUMENTS+=-v /tmp/checkpoint:/checkpoint +UNIT_TESTS=python3 -m pytest rl_coach/tests -m unit_test +INTEGRATION_TESTS=python3 -m pytest rl_coach/tests -m integration_test -n auto --tb=short +GOLDEN_TESTS=python3 -m pytest rl_coach/tests -m golden_test -n auto +TRACE_TESTS=python3 rl_coach/tests/trace_tests.py -prl + CONTEXT = $(realpath ..) 
ifndef DOCKER @@ -35,17 +40,16 @@ shell: build ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} /bin/bash unit_tests: build - ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 -m pytest rl_coach/tests -m unit_test -n 8 + ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${UNIT_TESTS} -n 8 integration_tests: build - ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 -m pytest rl_coach/tests -m integration_test -n auto --tb=short + ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${INTEGRATION_TESTS} golden_tests: build - # ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 rl_coach/tests/golden_tests.py - time ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 -m pytest rl_coach/tests -m golden_test -n auto + ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${GOLDEN_TESTS} trace_tests: build - ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 rl_coach/tests/trace_tests.py -prl + ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${TRACE_TESTS} run: build ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} @@ -73,3 +77,15 @@ kubernetes: stop_kubernetes push: build ${DOCKER} tag ${IMAGE} ${REGISTRY}${IMAGE} ${DOCKER} push ${REGISTRY}${IMAGE} + +unit_tests_without_docker: + cd .. && ${UNIT_TESTS} + +integration_tests_without_docker: + cd .. && ${INTEGRATION_TESTS} + +golden_tests_without_docker: + cd .. && ${GOLDEN_TESTS} + +trace_tests_without_docker: + cd .. 
&& ${TRACE_TESTS} diff --git a/docker/docker_entrypoint.sh b/docker/docker_entrypoint.sh index 9b19f90..feccda2 100644 --- a/docker/docker_entrypoint.sh +++ b/docker/docker_entrypoint.sh @@ -16,6 +16,4 @@ set -e export VIZDOOM_ROOT=`pip show vizdoom 2>/dev/null | awk '/Location/{print $2}'`/vizdoom -cd /root/src/ - -exec "$@" +bash -c "$@" diff --git a/requirements.txt b/requirements.txt index acd5ff7..90c6785 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ gym==0.10.5 bokeh==0.13.0 futures==3.1.1 wxPython==4.0.1 -kubernetes==7.0.0 +kubernetes==8.0.0b1 redis==2.10.6 minio==4.0.5 +pytest==3.8.2 diff --git a/rl_coach/graph_managers/graph_manager.py b/rl_coach/graph_managers/graph_manager.py index 278d5f0..4b5ae9f 100644 --- a/rl_coach/graph_managers/graph_manager.py +++ b/rl_coach/graph_managers/graph_manager.py @@ -364,7 +364,6 @@ class GraphManager(object): if self.agent_params.memory.memory_backend_params.run_type == "worker": data_store = get_data_store(self.data_store_params) data_store.load_from_store() - # perform several steps of playing count_end = self.current_step_counter + steps while self.current_step_counter < count_end: diff --git a/rl_coach/presets/Doom_Basic_DQN.py b/rl_coach/presets/Doom_Basic_DQN.py index 6c551d5..751a32b 100644 --- a/rl_coach/presets/Doom_Basic_DQN.py +++ b/rl_coach/presets/Doom_Basic_DQN.py @@ -36,7 +36,6 @@ agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False ############### env_params = DoomEnvironmentParameters(level='basic') - ######## # Test # ######## diff --git a/rl_coach/tests/test_eks.py b/rl_coach/tests/test_eks.py new file mode 100644 index 0000000..f75ab04 --- /dev/null +++ b/rl_coach/tests/test_eks.py @@ -0,0 +1,183 @@ + +import argparse +import pytest +import time +from kubernetes import client, config + + +class EKSHandler(): + + def __init__(self, cluster, build_num, test_name, test_command, image, cpu, memory, working_dir): + self.cluster = cluster + self.build_num = 
build_num + self.test_name = test_name + self.test_command = test_command + self.image = image + self.cpu = cpu + self.memory = memory + config.load_kube_config() + self.namespace = '{}-{}'.format(test_name, build_num) + self.corev1_api = client.CoreV1Api() + self.create_namespace() + self.working_dir = working_dir + + def create_namespace(self): + namespace = client.V1Namespace( + api_version='v1', + kind="Namespace", + metadata=client.V1ObjectMeta(name=self.namespace) + ) + + try: + self.corev1_api.create_namespace(namespace) + except client.rest.ApiException as e: + raise RuntimeError("Failed to create namespace. Got exception: {}".format(e)) + + def deploy(self): + container = client.V1Container( + name=self.test_name, + image=self.image, + args=[self.test_command], + image_pull_policy='Always', + working_dir=self.working_dir, + stdin=True, + tty=True + ) + pod_spec = client.V1PodSpec( + containers=[container], + restart_policy='Never' + ) + pod = client.V1Pod( + api_version="v1", + kind="Pod", + metadata=client.V1ObjectMeta(name=self.test_name), + spec=pod_spec + ) + + try: + self.corev1_api.create_namespaced_pod(self.namespace, pod) + except client.rest.ApiException as e: + print("Got exception: {} while creating a pod".format(e)) + return 1 + + return 0 + + def print_logs(self): + while True: + time.sleep(10) + # Try to tail the pod logs + try: + for line in self.corev1_api.read_namespaced_pod_log( + self.test_name, self.namespace, follow=True, + _preload_content=False + ): + print(line.decode('utf-8'), flush=True, end='') + + except client.rest.ApiException as e: + pass + + try: + pod = self.corev1_api.read_namespaced_pod(self.test_name, self.namespace) + except client.rest.ApiException as e: + continue + + if not hasattr(pod, 'status') or not pod.status: + continue + if not hasattr(pod.status, 'container_statuses') or not pod.status.container_statuses: + continue + + for container_status in pod.status.container_statuses: + if container_status.state.waiting
is not None: + if container_status.state.waiting.reason == 'Error' or \ + container_status.state.waiting.reason == 'CrashLoopBackOff' or \ + container_status.state.waiting.reason == 'ImagePullBackOff' or \ + container_status.state.waiting.reason == 'ErrImagePull': + return + if container_status.state.terminated is not None: + return + + def get_return_status(self): + # This part will get executed if the pod is one of the following phases: not ready, failed or terminated. + # Check if the pod has errored out, else just try again. + # Get the pod + try: + pod = self.corev1_api.read_namespaced_pod(self.test_name, self.namespace) + except client.rest.ApiException as e: + return 1 + + if not hasattr(pod, 'status') or not pod.status: + return 0 + if not hasattr(pod.status, 'container_statuses') or not pod.status.container_statuses: + return 0 + + for container_status in pod.status.container_statuses: + if container_status.state.waiting is not None: + if container_status.state.waiting.reason == 'Error' or \ + container_status.state.waiting.reason == 'CrashLoopBackOff' or \ + container_status.state.waiting.reason == 'ImagePullBackOff' or \ + container_status.state.waiting.reason == 'ErrImagePull': + return 1 + if container_status.state.terminated is not None: + return container_status.state.terminated.exit_code + + def cleanup(self): + + # Delete pod + try: + self.corev1_api.delete_namespaced_pod(self.test_name, self.namespace, client.V1DeleteOptions()) + except client.rest.ApiException as e: + print("Got exception while deleting pod: {}".format(e)) + + # Delete namespace + try: + self.corev1_api.delete_namespace(self.namespace, client.V1DeleteOptions()) + except client.rest.ApiException as e: + print("Got exception while deleting namespace: {}".format(e)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument( + '-c', '--cluster', help="(string) Name of the cluster", type=str, required=True + ) + parser.add_argument( + '-bn', 
'--build-num', help="(int) CI Build number", type=int, required=True + ) + parser.add_argument( + '-tn', '--test-name', help="(string) Name of the test", type=str, required=True + ) + parser.add_argument( + '-tc', '--test-command', help="(string) command to execute", type=str, required=True + ) + parser.add_argument( + '-i', '--image', help="(string) Container image", type=str, required=True + ) + parser.add_argument( + '-cpu', help="(string) Units of cpu to use", type=str, required=True + ) + parser.add_argument( + '-mem', help="(string) The amount in megabytes", type=str, required=True + ) + parser.add_argument( + '--working-dir', help="(string) The working dir in the container", type=str, required=False, + default='/root/src/docker' + ) + args = parser.parse_args() + + obj = EKSHandler( + args.cluster, args.build_num, args.test_name, args.test_command, + args.image, args.cpu, args.mem, args.working_dir + ) + + if obj.deploy() != 0: + obj.cleanup() + pytest.fail("Failed to deploy") + + obj.print_logs() + + if obj.get_return_status() != 0: + obj.cleanup() + pytest.fail("Failed to run tests") + + obj.cleanup()