From 977df6ed31169dba4c78e61da3eb4c8519311c37 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 6 Jul 2020 20:21:36 +0200 Subject: [PATCH] Docker: building XLA base image (#2494) * refactor * add TPU base * wip * builds * typo * extras * simple * unzip * rename --- .drone.yml | 2 +- .github/workflows/docker-builds.yml | 31 ++++++++- .github/workflows/tpu-testing.yml | 4 +- {docker => dockers}/README.md | 4 +- {docker => dockers/conda}/Dockerfile | 0 dockers/tpu-extras/Dockerfile | 65 +++++++++++++++++++ {docker/tpu => dockers/tpu-tests}/Dockerfile | 0 .../tpu-tests}/docker-entrypoint.sh | 0 .../tpu-tests}/tpu_test_cases.jsonnet | 0 tests/README.md | 6 +- 10 files changed, 101 insertions(+), 11 deletions(-) rename {docker => dockers}/README.md (82%) rename {docker => dockers/conda}/Dockerfile (100%) create mode 100644 dockers/tpu-extras/Dockerfile rename {docker/tpu => dockers/tpu-tests}/Dockerfile (100%) rename {docker/tpu => dockers/tpu-tests}/docker-entrypoint.sh (100%) rename {docker/tpu => dockers/tpu-tests}/tpu_test_cases.jsonnet (100%) diff --git a/.drone.yml b/.drone.yml index 8a022ca2eb..607c6d0d0c 100644 --- a/.drone.yml +++ b/.drone.yml @@ -6,7 +6,7 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:devel-pt_1_4 + image: pytorchlightning/pytorch_lightning:devel-pt1.4 environment: SLURM_LOCALID: 0 diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index b587bde4cc..bcce902846 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -10,7 +10,7 @@ on: - created jobs: - build: + build-Conda: runs-on: ubuntu-20.04 strategy: matrix: @@ -35,7 +35,7 @@ jobs: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - dockerfile: docker/Dockerfile + dockerfile: dockers/conda/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ 
matrix.pytorch_version }} tags: "nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" timeout-minutes: 40 @@ -53,7 +53,32 @@ jobs: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - dockerfile: docker/Dockerfile + dockerfile: dockers/conda/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }} tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" timeout-minutes: 40 + + build-XLA: + runs-on: ubuntu-20.04 + strategy: + matrix: + python_version: [3.6, 3.7] + steps: + - name: Checkout + uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - name: Publish Master to Docker + # publish master + uses: docker/build-push-action@v1.1.0 + if: github.event_name == 'push' + with: + repository: pytorchlightning/pytorch_lightning + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + dockerfile: dockers/tpu-extras/Dockerfile + build_args: PYTHON_VERSION=${{ matrix.python_version }} + tags: "XLA-extras-py${{ matrix.python_version }}" + timeout-minutes: 25 diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index 686346d98f..1433575a96 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -60,7 +60,7 @@ jobs: shell: bash - name: Build and Push Docker Image run: | - cd docker/tpu + cd dockers/tpu-tests docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" . 
docker push "$IMAGE:$GITHUB_RUN_ID" shell: bash @@ -77,7 +77,7 @@ jobs: - name: Deploy the job on the kubernetes cluster run: |- - job_name=$(jsonnet -J ml-testing-accelerators/ docker/tpu/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$GITHUB_RUN_ID | kubectl create -f -) && \ + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$GITHUB_RUN_ID | kubectl create -f -) && \ job_name=${job_name#job.batch/} && \ job_name=${job_name% created} && \ echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \ diff --git a/docker/README.md b/dockers/README.md similarity index 82% rename from docker/README.md rename to dockers/README.md index 72d924cef2..b03c3d7a57 100644 --- a/docker/README.md +++ b/dockers/README.md @@ -4,7 +4,7 @@ You can build it on your own, note it takes lots of time, be prepared. ```bash git clone -docker image build -t pytorch-lightning:latest -f docker/Dockerfile . +docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile . ``` or with specific arguments @@ -13,7 +13,7 @@ or with specific arguments git clone docker image build \ -t pytorch-lightning:py38 \ - -f docker/Dockerfile \ + -f dockers/conda/Dockerfile \ --build-arg PYTHON_VERSION=3.8 \ --build-arg PYTORCH_VERSION=1.4 \ . diff --git a/docker/Dockerfile b/dockers/conda/Dockerfile similarity index 100% rename from docker/Dockerfile rename to dockers/conda/Dockerfile diff --git a/dockers/tpu-extras/Dockerfile b/dockers/tpu-extras/Dockerfile new file mode 100644 index 0000000000..ef5354dff6 --- /dev/null +++ b/dockers/tpu-extras/Dockerfile @@ -0,0 +1,65 @@ +FROM google/cloud-sdk:slim + +# This Dockerfile installs pytorch/xla 3.7 wheels. There are also 3.6 wheels available; see below. 
+ARG PYTHON_VERSION=3.7
+# bash (not sh) is required: the RUN below relies on the ${PYTHON_VERSION/./} substitution
+SHELL ["/bin/bash", "-c"]
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        python${PYTHON_VERSION} \
+        python${PYTHON_VERSION}-dev \
+        $( [ ${PYTHON_VERSION%%.*} -ge 3 ] && echo "python${PYTHON_VERSION%%.*}-distutils" ) \
+        build-essential \
+        cmake \
+        wget \
+        unzip \
+        ca-certificates \
+        libomp5 \
+    && \
+#
+# Install pip; TLS verification stays enabled because ca-certificates is installed above
+    wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll && \
+    python${PYTHON_VERSION} get-pip.py && \
+    rm get-pip.py && \
+#
+# Make python${PYTHON_VERSION} the default `python` and `python<major>` interpreter
+    update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \
+#
+# Install Pytorch XLA
+    py_version=${PYTHON_VERSION/./} && \
+    # Wheel tag is derived from PYTHON_VERSION (3.7 -> cp37-cp37m); the hard-coded "m" ABI suffix only exists for CPython <= 3.7
+    gsutil cp "gs://tpu-pytorch/wheels/torch-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" . && \
+    gsutil cp "gs://tpu-pytorch/wheels/torch_xla-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" . && \
+    gsutil cp "gs://tpu-pytorch/wheels/torchvision-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" . && \
+    pip install --no-cache-dir "torch-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" && \
+    pip install --no-cache-dir "torch_xla-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" && \
+    pip install --no-cache-dir "torchvision-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" && \
+    rm "torch-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" && \
+    rm "torch_xla-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" && \
+    rm "torchvision-nightly-cp${py_version}-cp${py_version}m-linux_x86_64.whl" && \
+    pip install --no-cache-dir mkl && \
+#
+# Cleaning: drop apt caches and package lists in this same layer so they never reach the image
+    apt-get autoremove -y && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Install pytorch-lightning dependencies.
+RUN \
+# Get package: fetch and unpack a snapshot of the master branch
+    wget https://github.com/PyTorchLightning/pytorch-lightning/archive/master.zip --progress=bar:force:noscroll && \
+    unzip master.zip && \
+    rm master.zip && \
+#
+# Install PL dependencies (pip cache is not kept, it would only bloat the layer)
+    cd pytorch-lightning-master && \
+    pip install --no-cache-dir --requirement ./requirements/base.txt && \
+    # Drop Horovod: rewrite extra.txt without any line starting with "horovod"
+    python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
+    pip install --no-cache-dir --requirement ./requirements/extra.txt && \
+    cd .. && \
+    rm -rf pytorch-lightning-master
+# Append /usr/local/lib/ without a leading ":" when LD_LIBRARY_PATH is unset (an empty entry means the current directory)
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/"
+# Smoke test: the build fails here if torch cannot be imported
+RUN python -c "import torch; print(torch.__version__)"
diff --git a/docker/tpu/Dockerfile b/dockers/tpu-tests/Dockerfile
similarity index 100%
rename from docker/tpu/Dockerfile
rename to dockers/tpu-tests/Dockerfile
diff --git a/docker/tpu/docker-entrypoint.sh b/dockers/tpu-tests/docker-entrypoint.sh
similarity index 100%
rename from docker/tpu/docker-entrypoint.sh
rename to dockers/tpu-tests/docker-entrypoint.sh
diff --git a/docker/tpu/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet
similarity index 100%
rename from docker/tpu/tpu_test_cases.jsonnet
rename to dockers/tpu-tests/tpu_test_cases.jsonnet
diff --git a/tests/README.md b/tests/README.md
index 4857da1724..6286e8b9e8 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -54,11 +54,11 @@ coverage xml
 You can build it on your own, note it takes lots of time, be prepared.
 ```bash
 git clone
-docker image build -t pytorch_lightning:devel-pt_1_4 -f tests/Dockerfile --build-arg TORCH_VERSION=1.4 .
+docker image build -t pytorch_lightning:devel-torch1.4 -f tests/Dockerfile --build-arg TORCH_VERSION=1.4 .
 ```
 To build other versions, select different Dockerfile.
```bash docker image list -docker run --rm -it pytorch_lightning:devel-pt_1_4 bash -docker image rm pytorch_lightning:devel-pt_1_4 +docker run --rm -it pytorch_lightning:devel-torch1.4 bash +docker image rm pytorch_lightning:devel-torch1.4 ```