1
0
mirror of https://github.com/gryf/coach.git synced 2025-12-17 19:20:19 +01:00

Setup basic CI flow (#38)

Adds automated running of unit and integration tests (and, optionally, longer-running tests)
This commit is contained in:
Ajay Deshpande
2018-10-24 18:27:58 -07:00
committed by Scott Leishman
parent 2cc6abc3c4
commit 16b3e99f37
10 changed files with 408 additions and 129 deletions

136
.circleci/config.yml Normal file
View File

@@ -0,0 +1,136 @@
# CircleCI 2.0 configuration for coach CI.
# Builds and pushes the coach docker images to ECR, then runs the test suites
# on an AWS EKS cluster: unit/integration tests run automatically; golden and
# trace tests run behind a manual approval gate.
# NOTE(review): indentation reconstructed from a whitespace-mangled copy —
# structure follows standard CircleCI aliases/jobs/workflows layout.
aliases:
  # Shared executor: python image + fixed working directory, merged into each
  # job via `<<: *executor_prep`.
  - &executor_prep
    docker:
      - image: circleci/python:3.7.0-stretch
    working_directory: ~/repo
  - &remote_docker
    # ensure layers of constructed docker containers are cached for reuse between jobs.
    setup_remote_docker:
      docker_layer_caching: true
  - &restore_cache
    restore_cache:
      keys:
        - v1-dependencies-{{ checksum "requirements.txt" }}
        # fallback to using the latest cache if no exact match is found
        - v1-dependencies-
  - &save_cache
    save_cache:
      paths:
        - ./venv
      key: v1-dependencies-{{ checksum "requirements.txt" }}
  # Install aws/k8s tooling and point kubectl at the CI EKS cluster.
  # AWS credentials are stored base64-encoded in the CircleCI environment.
  - &aws_prep
    run:
      name: Prepare aws cli
      command: |
        sudo pip install awscli pytest kubernetes==8.0.0b1
        export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode`
        export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode`
        $(aws ecr get-login --no-include-email --region us-west-2)
        sudo curl -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.10.3/2018-07-26/bin/linux/amd64/aws-iam-authenticator
        sudo chmod a+x /usr/local/bin/aws-iam-authenticator
        aws eks update-kubeconfig --name coach-aws-cicd

version: 2
jobs:
  # Build coach-base and coach images, tag with `git describe`, push to ECR.
  build:
    <<: *executor_prep
    steps:
      - checkout
      - *remote_docker
      - *restore_cache
      - *aws_prep
      - run:
          name: Build and push container
          command: |
            REGISTRY=316971102342.dkr.ecr.us-west-2.amazonaws.com
            TAG=$(git describe --tags --always --dirty)
            docker pull ${REGISTRY}/coach-base:${MASTER_BRANCH}
            docker build --cache-from ${REGISTRY}/coach-base:${MASTER_BRANCH} -t ${REGISTRY}/coach-base:${TAG} -f docker/Dockerfile.base .
            docker push ${REGISTRY}/coach-base:${TAG}
            docker tag ${REGISTRY}/coach-base:${TAG} coach-base:master
            docker build -t ${REGISTRY}/coach:${TAG} -f docker/Dockerfile .
            docker push ${REGISTRY}/coach:${TAG}
          no_output_timeout: 30m
  # Each *_tests job launches a pod on EKS (via test_eks.py) running the
  # corresponding `make *_without_docker` target inside the freshly built image.
  unit_tests:
    <<: *executor_prep
    steps:
      - checkout
      - *remote_docker
      - *restore_cache
      - *aws_prep
      - run:
          name: run unit tests
          command: |
            export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode`
            export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode`
            python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn unit-test -tc 'make unit_tests_without_docker' -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096
  integration_tests:
    <<: *executor_prep
    steps:
      - checkout
      - *remote_docker
      - *restore_cache
      - *aws_prep
      - run:
          name: run integration tests
          command: |
            export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode`
            export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode`
            python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn integration-test -tc 'make integration_tests_without_docker' -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096
  golden_tests:
    <<: *executor_prep
    steps:
      - checkout
      - *remote_docker
      - *restore_cache
      - *aws_prep
      - run:
          name: run golden tests
          command: |
            export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode`
            export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode`
            python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn golden-test -tc 'make golden_tests_without_docker' -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096
  trace_tests:
    <<: *executor_prep
    steps:
      - checkout
      - *remote_docker
      - *restore_cache
      - *aws_prep
      - run:
          name: run trace tests
          command: |
            export AWS_ACCESS_KEY_ID=`echo ${AWS_ACCESS_KEY_ID} | base64 --decode`
            export AWS_SECRET_ACCESS_KEY=`echo ${AWS_SECRET_ACCESS_KEY} | base64 --decode`
            python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn trace-test -tc 'make trace_tests_without_docker' -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096

workflows:
  version: 2
  build_and_test:
    jobs:
      - build
      - unit_tests:
          requires:
            - build
      - integration_tests:
          requires:
            - build
      # Manual gate: golden/trace tests are long-running, require approval.
      - e2e_approval:
          type: approval
          requires:
            - build
      - golden_tests:
          requires:
            - e2e_approval
      - trace_tests:
          requires:
            - e2e_approval

2
.gitignore vendored
View File

@@ -20,12 +20,12 @@ rl_coach.egg*
contrib contrib
test_log_* test_log_*
dist dist
.DS_Store
datasets datasets
.cache .cache
.pytest_cache .pytest_cache
core core
trace_test* trace_test*
.DS_Store
*.swp *.swp
*.swo *.swo
.cache/ .cache/

View File

@@ -1,120 +1,4 @@
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 FROM coach-base:master
# https://github.com/NVIDIA/nvidia-docker/issues/619
RUN rm /etc/apt/sources.list.d/cuda.list
RUN apt-get update && \
apt-get upgrade -y && \
apt-get clean autoclean && \
apt-get autoremove -y
RUN apt-get update && \
apt-get install -y python-pip && \
apt-get clean autoclean && \
apt-get autoremove -y
RUN pip install pip --upgrade
WORKDIR /root
################################
# Install apt-get Requirements #
################################
# General
RUN apt-get update && \
apt-get install -y python3-pip cmake zlib1g-dev python3-tk python-opencv && \
apt-get clean autoclean && \
apt-get autoremove -y
# Boost libraries
RUN apt-get update && \
apt-get install -y libboost-all-dev && \
apt-get clean autoclean && \
apt-get autoremove -y
# Scipy requirements
RUN apt-get update && \
apt-get install -y libblas-dev liblapack-dev libatlas-base-dev gfortran && \
apt-get clean autoclean && \
apt-get autoremove -y
# Pygame requirements
RUN apt-get update && \
apt-get install -y libsdl-dev libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev && \
apt-get clean autoclean && \
apt-get autoremove -y
RUN apt-get update && \
apt-get install -y libsmpeg-dev libportmidi-dev libavformat-dev libswscale-dev && \
apt-get clean autoclean && \
apt-get autoremove -y
# Dashboard
RUN apt-get update && \
apt-get install -y dpkg-dev build-essential python3.5-dev libjpeg-dev libtiff-dev libsdl1.2-dev libnotify-dev \
freeglut3 freeglut3-dev libsm-dev libgtk2.0-dev libgtk-3-dev libwebkitgtk-dev libgtk-3-dev \
libwebkitgtk-3.0-dev libgstreamer-plugins-base1.0-dev && \
apt-get clean autoclean && \
apt-get autoremove -y
# Gym
RUN apt-get update && \
apt-get install -y libav-tools libsdl2-dev swig cmake && \
apt-get clean autoclean && \
apt-get autoremove -y
# Mujoco_py
RUN apt-get update && \
apt-get install -y curl libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev software-properties-common && \
apt-get clean autoclean && \
apt-get autoremove -y
# ViZDoom
RUN apt-get update && \
apt-get install -y build-essential zlib1g-dev libsdl2-dev libjpeg-dev \
nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \
libopenal-dev timidity libwildmidi-dev unzip wget && \
apt-get clean autoclean && \
apt-get autoremove -y
############################
# Install Pip Requirements #
############################
RUN pip3 install --upgrade pip
RUN pip3 install pytest
RUN pip3 install pytest-xdist
# initial installation of coach, so that the docker build won't install everything from scratch
RUN pip3 install rl_coach>=0.10.0
# install additional environments
RUN pip3 install gym[atari]==0.10.5
RUN pip3 install mujoco_py==1.50.1.56
RUN pip3 install vizdoom==1.1.6
# FROM ubuntu:16.04
#
# RUN apt-get update \
# && apt-get install -y \
# python3-pip cmake zlib1g-dev python3-tk python-opencv \
# libboost-all-dev \
# libblas-dev liblapack-dev libatlas-base-dev gfortran \
# libsdl-dev libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev \
# libsmpeg-dev libportmidi-dev libavformat-dev libswscale-dev \
# dpkg-dev build-essential python3.5-dev libjpeg-dev libtiff-dev \
# libsdl1.2-dev libnotify-dev freeglut3 freeglut3-dev libsm-dev \
# libgtk2.0-dev libgtk-3-dev libwebkitgtk-dev libgtk-3-dev \
# libwebkitgtk-3.0-dev libgstreamer-plugins-base1.0-dev \
# libav-tools libsdl2-dev swig
#
# # installing python dependencies
# RUN pip3 install --upgrade pip
RUN apt-get update && apt-get install -y wget zip
RUN mkdir -p ~/.mujoco \
&& wget https://www.roboti.us/download/mjpro150_linux.zip -O mujoco.zip \
&& unzip mujoco.zip -d ~/.mujoco \
&& rm mujoco.zip
ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH
RUN curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf \
&& chmod +x /usr/local/bin/patchelf
RUN mkdir /root/src RUN mkdir /root/src
COPY setup.py /root/src/. COPY setup.py /root/src/.

63
docker/Dockerfile.base Normal file
View File

@@ -0,0 +1,63 @@
# Base image for coach CI: CUDA 9.0 + all apt/pip dependencies needed by the
# RL environments (gym, mujoco, vizdoom, pygame, dashboard). The main
# docker/Dockerfile builds on top of this to avoid reinstalling everything
# on every CI run.
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# https://github.com/NVIDIA/nvidia-docker/issues/619
RUN rm /etc/apt/sources.list.d/cuda.list
RUN apt-get update && \
apt-get upgrade -y && \
apt-get clean autoclean && \
apt-get autoremove -y && apt-get update && \
apt-get install -y python-pip && \
apt-get clean autoclean && \
apt-get autoremove -y
RUN pip install pip --upgrade
WORKDIR /root
################################
# Install apt-get Requirements #
################################
# General
# Single RUN layer keeps the image smaller than one layer per package group.
RUN apt-get update && \
apt-get install -y python3-pip cmake zlib1g-dev python3-tk python-opencv \
# Boost libraries
libboost-all-dev \
# Scipy requirements
libblas-dev liblapack-dev libatlas-base-dev gfortran \
# Pygame requirements
libsdl-dev libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev \
libsmpeg-dev libportmidi-dev libavformat-dev libswscale-dev \
# Dashboard
dpkg-dev build-essential python3.5-dev libjpeg-dev libtiff-dev libsdl1.2-dev libnotify-dev \
freeglut3 freeglut3-dev libsm-dev libgtk2.0-dev libgtk-3-dev libwebkitgtk-dev libgtk-3-dev \
libwebkitgtk-3.0-dev libgstreamer-plugins-base1.0-dev \
# Gym
libav-tools libsdl2-dev swig cmake \
# Mujoco_py
curl libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev software-properties-common \
# ViZDoom
build-essential zlib1g-dev libsdl2-dev libjpeg-dev \
nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \
libopenal-dev timidity libwildmidi-dev unzip wget && \
apt-get clean autoclean && \
apt-get autoremove -y
############################
# Install Pip Requirements #
############################
RUN pip3 install --upgrade pip
RUN pip3 install pytest
RUN pip3 install pytest-xdist
# initial installation of coach, so that the docker build won't install everything from scratch
RUN pip3 install rl_coach>=0.10.0 && pip3 install gym[atari]==0.10.5 && \
pip3 install mujoco_py==1.50.1.56 && pip3 install vizdoom==1.1.6
# Mujoco runtime: download binaries; the license key is expected to be
# provided separately (see commented COPY below).
RUN mkdir -p ~/.mujoco \
&& wget https://www.roboti.us/download/mjpro150_linux.zip -O mujoco.zip \
&& unzip mujoco.zip -d ~/.mujoco \
&& rm mujoco.zip
# COPY ./mjkey.txt /root/.mujoco/
ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH
# patchelf is required by mujoco_py to rewrite library rpaths.
RUN curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf \
&& chmod +x /usr/local/bin/patchelf

View File

@@ -20,6 +20,11 @@ RUN_ARGUMENTS+=--rm
RUN_ARGUMENTS+=--net host RUN_ARGUMENTS+=--net host
RUN_ARGUMENTS+=-v /tmp/checkpoint:/checkpoint RUN_ARGUMENTS+=-v /tmp/checkpoint:/checkpoint
UNIT_TESTS=python3 -m pytest rl_coach/tests -m unit_test
INTEGRATION_TESTS=python3 -m pytest rl_coach/tests -m integration_test -n auto --tb=short
GOLDEN_TESTS=python3 -m pytest rl_coach/tests -m golden_test -n auto
TRACE_TESTS=python3 rl_coach/tests/trace_tests.py -prl
CONTEXT = $(realpath ..) CONTEXT = $(realpath ..)
ifndef DOCKER ifndef DOCKER
@@ -35,17 +40,16 @@ shell: build
${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} /bin/bash ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} /bin/bash
unit_tests: build unit_tests: build
${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 -m pytest rl_coach/tests -m unit_test -n 8 ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${UNIT_TESTS} -n 8
integration_tests: build integration_tests: build
${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 -m pytest rl_coach/tests -m integration_test -n auto --tb=short ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${INTEGRATION_TESTS}
golden_tests: build golden_tests: build
# ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 rl_coach/tests/golden_tests.py ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${GOLDEN_TESTS}
time ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 -m pytest rl_coach/tests -m golden_test -n auto
trace_tests: build trace_tests: build
${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} python3 rl_coach/tests/trace_tests.py -prl ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${TRACE_TESTS}
run: build run: build
${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE} ${DOCKER} run ${RUN_ARGUMENTS} -it ${IMAGE}
@@ -73,3 +77,15 @@ kubernetes: stop_kubernetes
push: build push: build
${DOCKER} tag ${IMAGE} ${REGISTRY}${IMAGE} ${DOCKER} tag ${IMAGE} ${REGISTRY}${IMAGE}
${DOCKER} push ${REGISTRY}${IMAGE} ${DOCKER} push ${REGISTRY}${IMAGE}
unit_tests_without_docker:
cd .. && ${UNIT_TESTS}
integration_tests_without_docker:
cd .. && ${INTEGRATION_TESTS}
golden_tests_without_docker:
cd .. && ${GOLDEN_TESTS}
trace_tests_without_docker:
cd .. && ${TRACE_TESTS}

View File

@@ -16,6 +16,4 @@ set -e
export VIZDOOM_ROOT=`pip show vizdoom 2>/dev/null | awk '/Location/{print $2}'`/vizdoom export VIZDOOM_ROOT=`pip show vizdoom 2>/dev/null | awk '/Location/{print $2}'`/vizdoom
cd /root/src/ bash -c "$@"
exec "$@"

View File

@@ -12,6 +12,7 @@ gym==0.10.5
bokeh==0.13.0 bokeh==0.13.0
futures==3.1.1 futures==3.1.1
wxPython==4.0.1 wxPython==4.0.1
kubernetes==7.0.0 kubernetes==8.0.0b1
redis==2.10.6 redis==2.10.6
minio==4.0.5 minio==4.0.5
pytest==3.8.2

View File

@@ -364,7 +364,6 @@ class GraphManager(object):
if self.agent_params.memory.memory_backend_params.run_type == "worker": if self.agent_params.memory.memory_backend_params.run_type == "worker":
data_store = get_data_store(self.data_store_params) data_store = get_data_store(self.data_store_params)
data_store.load_from_store() data_store.load_from_store()
# perform several steps of playing # perform several steps of playing
count_end = self.current_step_counter + steps count_end = self.current_step_counter + steps
while self.current_step_counter < count_end: while self.current_step_counter < count_end:

View File

@@ -36,7 +36,6 @@ agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
############### ###############
env_params = DoomEnvironmentParameters(level='basic') env_params = DoomEnvironmentParameters(level='basic')
######## ########
# Test # # Test #
######## ########

183
rl_coach/tests/test_eks.py Normal file
View File

@@ -0,0 +1,183 @@
import argparse
import pytest
import time
from kubernetes import client, config
class EKSHandler():
    """Runs a single CI test command as a pod on an AWS EKS cluster.

    Lifecycle: construct (creates a per-build namespace named
    '<test_name>-<build_num>' so concurrent builds don't collide), then
    deploy() the pod, print_logs() until it stops, get_return_status(),
    and finally cleanup().

    NOTE(review): indentation reconstructed from a whitespace-mangled copy;
    statement order and logic follow the original line-for-line.
    """

    def __init__(self, cluster, build_num, test_name, test_command, image, cpu, memory, working_dir):
        self.cluster = cluster
        self.build_num = build_num
        self.test_name = test_name
        self.test_command = test_command
        self.image = image
        self.cpu = cpu
        self.memory = memory
        # Uses the kubeconfig written by `aws eks update-kubeconfig` in CI.
        config.load_kube_config()
        self.namespace = '{}-{}'.format(test_name, build_num)
        self.corev1_api = client.CoreV1Api()
        self.create_namespace()
        self.working_dir = working_dir

    def create_namespace(self):
        """Create the per-build namespace; raises RuntimeError on API failure."""
        namespace = client.V1Namespace(
            api_version='v1',
            kind="Namespace",
            metadata=client.V1ObjectMeta(name=self.namespace)
        )
        try:
            self.corev1_api.create_namespace(namespace)
        except client.rest.ApiException as e:
            # Fixed typo in the original message ("namesapce").
            raise RuntimeError("Failed to create namespace. Got exception: {}".format(e))

    def deploy(self):
        """Create the test pod. Returns 0 on success, 1 on API failure."""
        container = client.V1Container(
            name=self.test_name,
            image=self.image,
            args=[self.test_command],
            image_pull_policy='Always',
            working_dir=self.working_dir,
            stdin=True,
            tty=True
        )
        pod_spec = client.V1PodSpec(
            containers=[container],
            restart_policy='Never'
        )
        pod = client.V1Pod(
            api_version="v1",
            kind="Pod",
            metadata=client.V1ObjectMeta(name=self.test_name),
            spec=pod_spec
        )
        try:
            self.corev1_api.create_namespaced_pod(self.namespace, pod)
        except client.rest.ApiException as e:
            print("Got exception: {} while creating a pod".format(e))
            return 1
        return 0

    def print_logs(self):
        """Tail the pod's logs until it terminates or enters an error state."""
        while True:
            time.sleep(10)
            # Try to tail the pod logs; best-effort — the pod may not be
            # ready yet, in which case we fall through to the status check.
            try:
                for line in self.corev1_api.read_namespaced_pod_log(
                    self.test_name, self.namespace, follow=True,
                    _preload_content=False
                ):
                    print(line.decode('utf-8'), flush=True, end='')
            except client.rest.ApiException as e:
                pass
            try:
                pod = self.corev1_api.read_namespaced_pod(self.test_name, self.namespace)
            except client.rest.ApiException as e:
                continue
            if not hasattr(pod, 'status') or not pod.status:
                continue
            if not hasattr(pod.status, 'container_statuses') or not pod.status.container_statuses:
                continue
            for container_status in pod.status.container_statuses:
                if container_status.state.waiting is not None:
                    # Unrecoverable waiting states — stop following logs.
                    if container_status.state.waiting.reason == 'Error' or \
                       container_status.state.waiting.reason == 'CrashLoopBackOff' or \
                       container_status.state.waiting.reason == 'ImagePullBackOff' or \
                       container_status.state.waiting.reason == 'ErrImagePull':
                        return
                if container_status.state.terminated is not None:
                    return

    def get_return_status(self):
        """Return the test pod's exit code (0 success, non-zero failure).

        This part will get executed if the pod is one of the following
        phases: not ready, failed or terminated. Check if the pod has
        errored out, else just try again.
        """
        # Get the pod
        try:
            pod = self.corev1_api.read_namespaced_pod(self.test_name, self.namespace)
        except client.rest.ApiException as e:
            return 1
        if not hasattr(pod, 'status') or not pod.status:
            return 0
        if not hasattr(pod.status, 'container_statuses') or not pod.status.container_statuses:
            return 0
        for container_status in pod.status.container_statuses:
            if container_status.state.waiting is not None:
                if container_status.state.waiting.reason == 'Error' or \
                   container_status.state.waiting.reason == 'CrashLoopBackOff' or \
                   container_status.state.waiting.reason == 'ImagePullBackOff' or \
                   container_status.state.waiting.reason == 'ErrImagePull':
                    return 1
            if container_status.state.terminated is not None:
                return container_status.state.terminated.exit_code
        # Originally fell through returning None (which callers treated as
        # failure via `!= 0`); make that failure path explicit.
        return 1

    def cleanup(self):
        """Best-effort deletion of the test pod and its namespace."""
        # Delete pod
        try:
            self.corev1_api.delete_namespaced_pod(self.test_name, self.namespace, client.V1DeleteOptions())
        except client.rest.ApiException as e:
            print("Got exception while deleting pod: {}".format(e))
        # Delete namespace
        try:
            self.corev1_api.delete_namespace(self.namespace, client.V1DeleteOptions())
        except client.rest.ApiException as e:
            print("Got exception while deleting namespace: {}".format(e))
if __name__ == '__main__':
    # CLI entry point used by the CircleCI jobs: deploy a test pod on EKS,
    # stream its logs, and fail (via pytest.fail) if deploy or the tests fail.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cluster', help="(string) Name of the cluster", type=str, required=True
    )
    parser.add_argument(
        '-bn', '--build-num', help="(int) CI Build number", type=int, required=True
    )
    parser.add_argument(
        '-tn', '--test-name', help="(string) Name of the test", type=str, required=True
    )
    parser.add_argument(
        '-tc', '--test-command', help="(string) command to execute", type=str, required=True
    )
    parser.add_argument(
        '-i', '--image', help="(string) Container image", type=str, required=True
    )
    parser.add_argument(
        '-cpu', help="(string) Units of cpu to use", type=str, required=True
    )
    parser.add_argument(
        '-mem', help="(string) The amount in megabytes", type=str, required=True
    )
    parser.add_argument(
        '--working-dir', help="(string) The working dir in the container", type=str, required=False,
        default='/root/src/docker'
    )
    args = parser.parse_args()
    obj = EKSHandler(
        args.cluster, args.build_num, args.test_name, args.test_command,
        args.image, args.cpu, args.mem, args.working_dir
    )
    # Always clean up the namespace/pod, even on failure.
    if obj.deploy() != 0:
        obj.cleanup()
        pytest.fail("Failed to deploy")
    obj.print_logs()
    if obj.get_return_status() != 0:
        obj.cleanup()
        pytest.fail("Failed to run tests")
    obj.cleanup()