Review notes (text before the first "diff --git" header is ignored by git apply / patch):
  * build-cuda now builds dockers/cuda-extras/Dockerfile (was dockers/tpu-extras/Dockerfile).
  * build-cuda passes TORCH_VERSION, the ARG name the CUDA Dockerfile declares, so the
    pytorch_version matrix value selects the base image.
  * The Dockerfile default TORCH_VERSION is 1.6.0, matching its "Existing images" header and the CI matrix.
  * Version-like scalars in the added workflow lines are quoted so YAML keeps them as strings.
  * Indentation and blank context lines were reconstructed from the hunk headers; verify against the
    target tree before applying.

diff --git a/.drone.yml b/.drone.yml
index 71532d96ed..67f0c38758 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -6,12 +6,13 @@ name: torch-GPU
 
 steps:
 - name: testing
-  image: pytorchlightning/pytorch_lightning:devel-pt1.4
+  image: pytorchlightning/pytorch_lightning:cuda-extras-py3.7-torch1.5
 
   environment:
     SLURM_LOCALID: 0
     CODECOV_TOKEN:
       from_secret: codecov_token
+    MKL_THREADING_LAYER: GNU
     HOROVOD_GPU_ALLREDUCE: NCCL
     HOROVOD_GPU_BROADCAST: NCCL
     HOROVOD_WITH_PYTORCH: 1
@@ -33,10 +34,10 @@ steps:
     - nvidia-smi
     #- bash ./tests/install_AMP.sh
     - apt-get update && apt-get install -y cmake
-    - pip install -r ./requirements/base.txt --user -q
-    - pip install -r ./requirements/devel.txt --user -q
+    - pip install -r ./requirements/base.txt --user -q --upgrade-strategy only-if-needed
+    - pip install -r ./requirements/devel.txt --user -q --upgrade-strategy only-if-needed
     #- pip install -r ./requirements/docs.txt --user -q
-    - pip install -r ./requirements/examples.txt --user -q
+    - pip install -r ./requirements/examples.txt --user -q --upgrade-strategy only-if-needed
     - pip list
     - python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')"
     - coverage run --source pytorch_lightning -m py.test pytorch_lightning tests -v --durations=25 # --flake8
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index d596ea7643..5cb1aec47a 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -82,3 +82,29 @@ jobs:
           build_args: PYTHON_VERSION=${{ matrix.python_version }}
           tags: "XLA-extras-py${{ matrix.python_version }}"
         timeout-minutes: 25
+
+  build-cuda:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        python_version: ["3.7"]
+        pytorch_version: ["1.3", "1.4", "1.5", "1.6.0"]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: "3.7"
+
+      - name: Publish Master to Docker
+        # publish master
+        uses: docker/build-push-action@v1.1.0
+        if: github.event_name == 'push'
+        with:
+          repository: pytorchlightning/pytorch_lightning
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+          dockerfile: dockers/cuda-extras/Dockerfile
+          build_args: PYTHON_VERSION=${{ matrix.python_version }},TORCH_VERSION=${{ matrix.pytorch_version }}
+          tags: "cuda-extras-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
+        timeout-minutes: 40
diff --git a/dockers/README.md b/dockers/README.md
index b03c3d7a57..7b3063e00f 100644
--- a/dockers/README.md
+++ b/dockers/README.md
@@ -12,7 +12,7 @@ or with specific arguments
 ```bash
 git clone
 docker image build \
-    -t pytorch-lightning:py38 \
+    -t pytorch-lightning:py3.8 \
     -f dockers/conda/Dockerfile \
     --build-arg PYTHON_VERSION=3.8 \
     --build-arg PYTORCH_VERSION=1.4 \
diff --git a/dockers/cuda-extras/Dockerfile b/dockers/cuda-extras/Dockerfile
new file mode 100644
index 0000000000..c4bc5cfb64
--- /dev/null
+++ b/dockers/cuda-extras/Dockerfile
@@ -0,0 +1,40 @@
+# Existing images:
+# --build-arg TORCH_VERSION=1.6.0 --build-arg CUDA_VERSION=10.1
+# --build-arg TORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.1
+# --build-arg TORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1
+# --build-arg TORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1
+# --build-arg TORCH_VERSION=1.2 --build-arg CUDA_VERSION=10.0
+# --build-arg TORCH_VERSION=1.1.0 --build-arg CUDA_VERSION=10.0 --build-arg CUDNN_VERSION=7.5
+
+ARG TORCH_VERSION=1.6.0
+ARG CUDA_VERSION=10.1
+ARG CUDNN_VERSION=7
+
+FROM pytorch/pytorch:${TORCH_VERSION}-cuda${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel
+
+ENV HOROVOD_GPU_ALLREDUCE=NCCL
+ENV HOROVOD_GPU_BROADCAST=NCCL
+ENV HOROVOD_WITH_PYTORCH=1
+ENV HOROVOD_WITHOUT_TENSORFLOW=1
+ENV HOROVOD_WITHOUT_MXNET=1
+ENV HOROVOD_WITH_GLOO=1
+ENV HOROVOD_WITHOUT_MPI=1
+ENV PATH="$PATH:/root/.local/bin"
+ENV MAKEFLAGS="-j$(nproc)"
+
+COPY ./tests/install_AMP.sh install_AMP.sh
+COPY ./requirements/base.txt requirements.txt
+COPY ./requirements/extra.txt requirements-extra.txt
+COPY ./requirements/test.txt requirements-tests.txt
+COPY ./requirements/examples.txt requirements-examples.txt
+
+RUN apt-get update && apt-get install -y cmake && \
+    # Install AMP
+    bash install_AMP.sh && \
+    pip install -r requirements.txt && \
+    # HOROVOD_BUILD_ARCH_FLAGS="-mfma" && \
+    pip install -r requirements-extra.txt && \
+    pip install -r requirements-examples.txt && \
+    pip install -r requirements-tests.txt && \
+    rm requirements* && \
+    pip list
diff --git a/tests/Dockerfile b/tests/Dockerfile
deleted file mode 100644
index 65c75c1ba3..0000000000
--- a/tests/Dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-ARG TORCH_VERSION=1.4
-ARG CUDA_VERSION=10.1
-
-FROM pytorch/pytorch:${TORCH_VERSION}-cuda${CUDA_VERSION}-cudnn7-devel
-
-ENV HOROVOD_GPU_ALLREDUCE: NCCL
-ENV HOROVOD_GPU_BROADCAST: NCCL
-ENV HOROVOD_WITH_PYTORCH: 1
-ENV HOROVOD_WITHOUT_TENSORFLOW: 1
-ENV HOROVOD_WITHOUT_MXNET: 1
-ENV HOROVOD_WITH_GLOO: 1
-ENV HOROVOD_WITHOUT_MPI: 1
-ENV PATH: "$PATH:/root/.local/bin"
-ENV MAKEFLAGS: "-j$(nproc)"
-
-COPY ./tests/install_AMP.sh install_AMP.sh
-COPY ./requirements/base.txt requirements.txt
-COPY ./requirements/extra.txt requirements-extra.txt
-COPY ./requirements/test.txt requirements-tests.txt
-
-# Install AMP
-RUN apt-get update && apt-get install -y cmake && \
-    bash install_AMP.sh && \
-    pip install -r requirements.txt --user && \
-    pip install -r requirements-extra.txt --user && \
-    pip install -r requirements-tests.txt --user && \
-    pip list
diff --git a/tests/README.md b/tests/README.md
index 6286e8b9e8..ccd62301aa 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -54,7 +54,7 @@ coverage xml
 You can build it on your own, note it takes lots of time, be prepared.
 ```bash
 git clone
-docker image build -t pytorch_lightning:devel-torch1.4 -f tests/Dockerfile --build-arg TORCH_VERSION=1.4 .
+docker image build -t pytorch_lightning:devel-torch1.4 -f dockers/cuda-extras/Dockerfile --build-arg TORCH_VERSION=1.4 .
 ```
 To build other versions, select different Dockerfile.
 ```bash