From 2b7e65b747c807c52a19054ed4a743c2400c8632 Mon Sep 17 00:00:00 2001
From: Louis Taylor
Date: Fri, 7 May 2021 13:07:29 +0100
Subject: [PATCH] Add base IPU dockerfiles (#7252)

---
 .github/workflows/ci_dockers.yml     | 33 ++++++++++
 .github/workflows/events-nightly.yml | 46 +++++++++++++
 dockers/base-ipu/Dockerfile          | 99 ++++++++++++++++++++++++++++
 dockers/ipu-ci-runner/Dockerfile     | 35 ++++++++++
 dockers/ipu-ci-runner/start.sh       | 96 +++++++++++++++++++++++++++
 5 files changed, 309 insertions(+)
 create mode 100644 dockers/base-ipu/Dockerfile
 create mode 100644 dockers/ipu-ci-runner/Dockerfile
 create mode 100644 dockers/ipu-ci-runner/start.sh

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index 23936a4751..04698f6b8b 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -149,3 +149,36 @@ jobs:
           file: dockers/nvidia/Dockerfile
           push: false
         timeout-minutes: 50
+
+  build-ipu:  # build-only check of the two IPU images (both steps set push: false)
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        include:  # single Python/PyTorch combination, forwarded below as Docker build args
+          - python_version: 3.8
+            pytorch_version: 1.7
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Build IPU Docker
+        uses: docker/build-push-action@v2
+        with:
+          build-args: |
+            PYTHON_VERSION=${{ matrix.python_version }}
+            PYTORCH_VERSION=${{ matrix.pytorch_version }}
+          file: dockers/base-ipu/Dockerfile
+          push: false
+          tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
+        timeout-minutes: 50
+
+      - name: Build IPU CI runner Docker
+        uses: docker/build-push-action@v2
+        with:
+          build-args: |
+            PYTHON_VERSION=${{ matrix.python_version }}
+            PYTORCH_VERSION=${{ matrix.pytorch_version }}
+          file: dockers/ipu-ci-runner/Dockerfile  # its FROM line names the base-ipu tag assigned in the previous step
+          push: false
+        timeout-minutes: 50
diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
index 987b237c37..70ab0db91a 100644
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@@ -152,3 +152,49 @@ jobs:
           push: true
           tags: nvcr.io/pytorchlightning/pytorch_lightning:nvidia
         timeout-minutes: 55
+
+  docker-ipu:  # builds the IPU base and CI-runner images and pushes both (push: true)
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        include:  # single Python/PyTorch combination, forwarded below as Docker build args
+          - python_version: 3.8
+            pytorch_version: 1.7
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      # Set up Docker Buildx, see https://github.com/docker/setup-buildx-action
+      # NOTE(review): the original note cites buildx's cache-from/cache-to arguments, which the steps below do not set
+      - uses: docker/setup-buildx-action@v1
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Publish IPU base to Docker Hub
+        # build dockers/base-ipu/Dockerfile and push it under the base-ipu tag below
+        uses: docker/build-push-action@v2
+        with:
+          build-args: |
+            PYTHON_VERSION=${{ matrix.python_version }}
+            PYTORCH_VERSION=${{ matrix.pytorch_version }}
+          file: dockers/base-ipu/Dockerfile
+          push: true
+          tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
+        timeout-minutes: 55
+
+      - name: Publish IPU CI runner to Docker Hub
+        # build dockers/ipu-ci-runner/Dockerfile (its FROM names the base-ipu tag above) and push it
+        uses: docker/build-push-action@v2
+        with:
+          build-args: |
+            PYTHON_VERSION=${{ matrix.python_version }}
+            PYTORCH_VERSION=${{ matrix.pytorch_version }}
+          file: dockers/ipu-ci-runner/Dockerfile
+          push: true
+          tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }}
+        timeout-minutes: 55
diff --git a/dockers/base-ipu/Dockerfile b/dockers/base-ipu/Dockerfile
new file mode 100644
index 0000000000..13b79b11fb
--- /dev/null
+++ b/dockers/base-ipu/Dockerfile
@@ -0,0 +1,99 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM ubuntu:20.04
+
+LABEL maintainer="PyTorchLightning"
+# Build arguments: Python / PyTorch versions for the conda env, and the Miniconda installer version
+ARG PYTHON_VERSION=3.8
+ARG PYTORCH_VERSION=1.7
+ARG CONDA_VERSION=4.9.2
+
+SHELL ["/bin/bash", "-c"]
+
+# DEBIAN_FRONTEND=noninteractive skips interactive apt configuration; CONDA_ENV names the conda env created below
+ENV \
+    DEBIAN_FRONTEND=noninteractive \
+    CONDA_ENV=lightning
+
+RUN apt-get update -qq && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        jq \
+        libomp5 \
+        libopenmpi-dev \
+        unzip \
+        wget \
+    && \
+# Install Miniconda; curl -f fails on HTTP errors instead of saving the error page as the installer.
+# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
+    curl -fsSL -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
+    chmod +x ~/miniconda.sh && \
+    ~/miniconda.sh -b && \
+    rm ~/miniconda.sh && \
+# Clean apt leftovers in the same layer that created them
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /root/.cache && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV \
+    PATH="/root/miniconda3/bin:$PATH" \
+    LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH"
+
+COPY environment.yml environment.yml
+# Create the env, then pin environment.yml to the same versions (no cudatoolkit spec: no CUDA_VERSION is declared here)
+RUN conda create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} -c pytorch -c pytorch-test -c pytorch-nightly && \
+    conda init bash && \
+    python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python>=[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \
+    python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'- pytorch[>=]+[\d\.]+', '# - pytorch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \
+    python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if not any(n in ln for n in ['pytorch>', 'horovod'])])" && \
+    cat environment.yml && \
+    conda env update --file environment.yml && \
+    conda clean -ya && \
+    rm environment.yml
+
+ENV \
+    PATH=/root/miniconda3/envs/${CONDA_ENV}/bin:$PATH \
+    LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" \
+    # make the conda environment created above the default one
+    CONDA_DEFAULT_ENV=${CONDA_ENV} \
+    MKL_THREADING_LAYER=GNU
+
+COPY ./requirements/extra.txt requirements-extra.txt
+COPY ./requirements/test.txt requirements-test.txt
+COPY ./requirements/adjust_versions.py requirements_adjust_versions.py
+# Install the extra and test requirements; fairscale and horovod are filtered out of the extras first
+RUN \
+    pip list | grep torch && \
+    python -c "import torch; print(torch.__version__)" && \
+    python requirements_adjust_versions.py requirements-extra.txt && \
+    python -c "fname = 'requirements-extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \
+    python -c "fname = 'requirements-extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" && \
+    # Install remaining requirements
+    pip install -r requirements-extra.txt --no-cache-dir && \
+    pip install -r requirements-test.txt --no-cache-dir && \
+    rm requirements*
+
+RUN \
+    # Show what we have and check major.minor against the build args (also correct for two-digit minors such as 3.10)
+    pip --version && \
+    conda info && \
+    pip list && \
+    python -c "import sys; ver = '.'.join(map(str, sys.version_info[:2])); assert ver == '$PYTHON_VERSION', sys.version" && \
+    python -c "import torch; ver = '.'.join(torch.__version__.split('.')[:2]); assert ver == '$PYTORCH_VERSION', torch.__version__"
diff --git a/dockers/ipu-ci-runner/Dockerfile b/dockers/ipu-ci-runner/Dockerfile
new file mode 100644
index 0000000000..89238d7de1
--- /dev/null
+++ b/dockers/ipu-ci-runner/Dockerfile
@@ -0,0 +1,35 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG PYTHON_VERSION=3.8
+ARG PYTORCH_VERSION=1.7
+
+FROM pytorchlightning/pytorch_lightning:base-ipu-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
+
+LABEL maintainer="PyTorchLightning"
+# Append a sudoers rule granting every user passwordless sudo for all commands
+RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+
+WORKDIR /azp
+
+COPY ./dockers/ipu-ci-runner/start.sh /usr/local/bin/
+# Fetch the agent's dependency installer at a pinned commit (-f: fail on HTTP errors), then run it
+RUN curl -fsSL -o /usr/local/bin/installdependencies.sh \
+    "https://raw.githubusercontent.com/microsoft/azure-pipelines-agent/d2acd5f77c6b3914cdb6ed0e5fbea672929c7da9/src/Misc/layoutbin/installdependencies.sh" && \
+    chmod +x /usr/local/bin/installdependencies.sh && \
+    chmod +x /usr/local/bin/start.sh && \
+    /usr/local/bin/installdependencies.sh
+
+ENTRYPOINT ["/usr/local/bin/start.sh"]
+CMD ["bash"]
diff --git a/dockers/ipu-ci-runner/start.sh b/dockers/ipu-ci-runner/start.sh
new file mode 100644
index 0000000000..caa452b978
--- /dev/null
+++ b/dockers/ipu-ci-runner/start.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+# Container entrypoint: downloads, configures and runs an Azure Pipelines agent. Slightly modified from
+# https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
+
+set -e
+
+if [ -z "$AZP_URL" ]; then
+  echo 1>&2 "error: missing AZP_URL environment variable"
+  exit 1
+fi
+
+if [ -z "$AZP_TOKEN_FILE" ]; then
+  if [ -z "$AZP_TOKEN" ]; then
+    echo 1>&2 "error: missing AZP_TOKEN environment variable"
+    exit 1
+  fi
+
+  AZP_TOKEN_FILE=/azp/.token
+  printf '%s' "$AZP_TOKEN" > "$AZP_TOKEN_FILE"  # quoted: the token is written verbatim, no word splitting or globbing
+fi
+# From here on the token is only read from $AZP_TOKEN_FILE
+unset AZP_TOKEN
+
+if [ -n "$AZP_WORK" ]; then
+  mkdir -p "$AZP_WORK"
+fi
+# Always start from an empty agent directory
+rm -rf /azp/agent
+mkdir /azp/agent
+cd /azp/agent
+
+export AGENT_ALLOW_RUNASROOT="1"  # presumably lets the agent run as root - confirm against the agent docs
+# Remove the agent registration, but only if config.sh exists in the current directory
+cleanup() {
+  if [ -e config.sh ]; then
+    print_header "Cleanup. Removing Azure Pipelines agent..."
+
+    ./config.sh remove --unattended \
+      --auth PAT \
+      --token "$(cat "$AZP_TOKEN_FILE")"
+  fi
+}
+# Print $1 wrapped in the ANSI colour escape sequences below
+print_header() {
+  lightcyan='\033[1;36m'
+  nocolor='\033[0m'
+  echo -e "${lightcyan}$1${nocolor}"
+}
+
+# Let the agent ignore the token env variables
+export VSO_AGENT_IGNORE=AZP_TOKEN,AZP_TOKEN_FILE
+
+print_header "1. Determining matching Azure Pipelines agent..."
+
+AZP_AGENT_RESPONSE=$(curl -LsS \
+  -u "user:$(cat "$AZP_TOKEN_FILE")" \
+  -H 'Accept:application/json;api-version=3.0-preview' \
+  "$AZP_URL/_apis/distributedtask/packages/agent?platform=linux-x64")
+
+if echo "$AZP_AGENT_RESPONSE" | jq . >/dev/null 2>&1; then
+  AZP_AGENTPACKAGE_URL=$(echo "$AZP_AGENT_RESPONSE" \
+    | jq -r '.value | map([.version.major,.version.minor,.version.patch,.downloadUrl]) | sort | .[length-1] | .[3]')
+fi
+
+if [ -z "$AZP_AGENTPACKAGE_URL" ] || [ "$AZP_AGENTPACKAGE_URL" = "null" ]; then
+  echo 1>&2 "error: could not determine a matching Azure Pipelines agent - check that account '$AZP_URL' is correct and the token is valid for that account"
+  exit 1
+fi
+
+print_header "2. Downloading and installing Azure Pipelines agent..."
+
+curl -LsS "$AZP_AGENTPACKAGE_URL" | tar -xz & wait $!
+
+source ./env.sh
+
+print_header "3. Configuring Azure Pipelines agent..."
+
+./config.sh --unattended \
+  --agent "${AZP_AGENT_NAME:-$(hostname)}" \
+  --url "$AZP_URL" \
+  --auth PAT \
+  --token "$(cat "$AZP_TOKEN_FILE")" \
+  --pool "${AZP_POOL:-Default}" \
+  --work "${AZP_WORK:-_work}" \
+  --replace \
+  --acceptTeeEula & wait $!
+
+print_header "4. Running Azure Pipelines agent..."
+ +trap 'cleanup; exit 130' INT +trap 'cleanup; exit 143' TERM + +# To be aware of TERM and INT signals call run.sh +# Running it with the --once flag at the end will shut down the agent after the build is executed +./run.sh --once & wait $!