From 9dd04028d5bb17fc98c8f495be8c9264ec6a7518 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Fri, 8 Jan 2021 15:36:49 +0000
Subject: [PATCH] tests for legacy checkpoints (#5223)

* wip
* generate
* clean
* tests
* copy
* download
* download
* download
* download
* download
* download
* download
* download
* download
* download
* download
* flake8
* extend
* aws
* extension
* pull
* pull
* pull
* pull
* pull
* pull
* pull
* try
* try
* try
* got it
* Apply suggestions from code review

(cherry picked from commit 72525f0a8396ae6dce5cf78ddf71e75fbba2dbfc)
---
 .drone.yml                                    |  8 ++
 .github/workflows/ci_test-conda.yml           | 15 ++-
 .github/workflows/ci_test-full.yml            | 13 ++-
 .github/workflows/nightly.yml                 |  2 +-
 .github/workflows/release-pypi.yml            | 50 +++++++++-
 .gitignore                                    |  4 +
 MANIFEST.in                                   |  1 +
 dockers/base-conda/Dockerfile                 |  2 +
 dockers/base-cuda/Dockerfile                  |  2 +
 dockers/tpu-tests/Dockerfile                  |  6 ++
 legacy/checkpoints/.gitkeep                   |  0
 legacy/generate_checkpoints.sh                | 40 ++++++++
 legacy/zero_training.py                       | 92 +++++++++++++++++++
 setup.py                                      |  2 +-
 tests/__init__.py                             |  2 +
 .../checkpointing/test_legacy_checkpoints.py  | 54 +++++++++++
 16 files changed, 286 insertions(+), 7 deletions(-)
 create mode 100644 legacy/checkpoints/.gitkeep
 create mode 100644 legacy/generate_checkpoints.sh
 create mode 100644 legacy/zero_training.py
 create mode 100644 tests/checkpointing/test_legacy_checkpoints.py

diff --git a/.drone.yml b/.drone.yml
index 472861852c..91ccba28a1 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -39,6 +39,14 @@ steps:
     # when the image has a defined CUDA version we can switch to this package spec "nvidia-dali-cuda${CUDA_VERSION%%.*}0"
     - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 --upgrade-strategy only-if-needed
     - pip list
+    # todo: remove the unzip install after a new nightly docker is created
+    - apt-get update -qq
+    - apt-get install -y --no-install-recommends unzip
+    # get legacy checkpoints
+    - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/
+    - unzip -o legacy/checkpoints.zip -d legacy/
+    - ls -l legacy/checkpoints/
+    # testing...
    - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=25 # --flake8
    # Running special tests
    - sh tests/special_tests.sh

diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml
index 15797ff59e..06a98e23c6 100644
--- a/.github/workflows/ci_test-conda.yml
+++ b/.github/workflows/ci_test-conda.yml
@@ -34,10 +34,21 @@ jobs:
       # todo this probably does not work with docker images, rather cache dockers
       uses: actions/cache@v2
       with:
-        path: Datasets # This path is specific to Ubuntu
-        # Look to see if there is a cache hit for the corresponding requirements file
+        path: Datasets
        key: pl-dataset

+    - name: Pull checkpoints from S3
+      # todo: consider adding some caching, but ATM all models are less than 100KB
+      run: |
+        # todo: remove the unzip install after a new nightly docker is created
+        apt-get update -qq
+        apt-get install -y --no-install-recommends unzip
+        # enter legacy and update checkpoints from S3
+        cd legacy
+        curl https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip --output checkpoints.zip
+        unzip -o checkpoints.zip
+        ls -l checkpoints/
+
     - name: Tests
       run: |
         # NOTE: running coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003
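Both CI paths above assume wget/curl plus unzip are present on the runner. Where they are not (e.g. the Windows runners below), the same pull step can be done with the Python standard library alone; a minimal cross-platform sketch, with the URL and target paths taken from the workflows above:

    # stdlib-only equivalent of the curl + unzip steps (illustrative sketch)
    from urllib.request import urlretrieve
    from zipfile import ZipFile

    URL = "https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip"

    # download the archive into legacy/ ...
    urlretrieve(URL, "legacy/checkpoints.zip")
    # ... and extract it, overwriting any stale checkpoints (like `unzip -o`)
    with ZipFile("legacy/checkpoints.zip") as zf:
        zf.extractall("legacy/")

The ci_test-full.yml workflow below already takes the urlretrieve route for the download because wget is unavailable on Windows; zipfile would remove the remaining unzip dependency in the same way.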
diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml
index c42b9732a8..0d8fb902c6 100644
--- a/.github/workflows/ci_test-full.yml
+++ b/.github/workflows/ci_test-full.yml
@@ -87,6 +87,16 @@ jobs:
         restore-keys: |
           ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-

+    - name: Pull checkpoints from S3
+      # todo: consider adding some caching, but ATM all models are less than 100KB
+      run: |
+        cd legacy
+        # wget is simpler but does not work on Windows
+        python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip', 'checkpoints.zip')"
+        ls -l .
+        unzip -o checkpoints.zip
+        ls -l checkpoints/
+
     - name: Install dependencies
       env:
         # MAKEFLAGS: "-j2"
@@ -119,8 +129,7 @@ jobs:
     - name: Cache datasets
       uses: actions/cache@v2
       with:
-        path: Datasets # This path is specific to Ubuntu
-        # Look to see if there is a cache hit for the corresponding requirements file
+        path: Datasets
        key: pl-dataset

     - name: Tests

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index ce9caf1b34..71227308cd 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -35,7 +35,7 @@ jobs:
       with:
         time: 5m

-      # We do this, since failures on test.pypi aren't that bad
+    # We do this, since failures on test.pypi aren't that bad
     - name: Publish to Test PyPI
       uses: pypa/gh-action-pypi-publish@v1.4.1
       with:

diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml
index 3cc3157ffb..b0310c3d36 100644
--- a/.github/workflows/release-pypi.yml
+++ b/.github/workflows/release-pypi.yml
@@ -5,7 +5,7 @@ on:  # Trigger the workflow on push or pull request, but only for the master branch
   push:
     branches: [master, "release/*"]  # include release branches like release/1.0.x
   release:
-    types: [created, "release/*"]
+    types: [created]

 jobs:
@@ -61,3 +61,51 @@ jobs:
       with:
         user: __token__
         password: ${{ secrets.pypi_password }}
+
+    # Note: This uses an internal pip API and may not always work
+    # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
+    - name: Cache pip
+      uses: actions/cache@v2
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
+        restore-keys: ${{ runner.os }}-pip-
+
+    - name: Install dependencies
+      run: |
+        pip install -r requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet
+        pip install virtualenv
+        pip install awscli
+
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v1
+      with:
+        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
+        aws-region: us-east-1
+
+    - name: Pull files from S3
+      run: |
+        aws s3 cp --recursive s3://pl-public-data/legacy/checkpoints/ legacy/checkpoints/  # --acl public-read
+        ls -l legacy/checkpoints/
+
+    - name: Generate checkpoint
+      if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
+      run: |
+        virtualenv vEnv --system-site-packages
+        source vEnv/bin/activate
+        pip install dist/*
+
+        pl_ver=$(python -c "import pytorch_lightning as pl ; print(pl.__version__)" 2>&1)
+        # generate a checkpoint for this version
+        bash legacy/generate_checkpoints.sh $pl_ver
+
+        deactivate
+        rm -rf vEnv
+
+    - name: Push files to S3
+      run: |
+        aws s3 sync legacy/checkpoints/ s3://pl-public-data/legacy/checkpoints/
+        cd legacy
+        zip -r checkpoints.zip checkpoints
+        aws s3 cp checkpoints.zip s3://pl-public-data/legacy/ --acl public-read

diff --git a/.gitignore b/.gitignore
index 237dbef370..65ff649c43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ timit_data/

 # C extensions
 *.so

+# PyCharm
 .idea/

 # Distribution / packaging
@@ -126,11 +127,14 @@ ENV/
 # mypy
 .mypy_cache/

+# pytest
+.pytest_cache/

 # data
 .data/
 Datasets/
 mnist/
+legacy/checkpoints/

 # pl tests
 ml-runs/

diff --git a/MANIFEST.in b/MANIFEST.in
index 450a9ec576..95672548f7 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -69,3 +69,4 @@ prune temp*
 prune test*
 prune benchmark*
 prune dockers
+prune legacy

diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile
index 2d1c166c20..27ac96f96e 100644
--- a/dockers/base-conda/Dockerfile
+++ b/dockers/base-conda/Dockerfile
@@ -39,7 +39,9 @@ RUN apt-get update -qq && \
         build-essential \
         cmake \
         git \
+        wget \
         curl \
+        unzip \
         ca-certificates \
     && \

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index bde54b8da7..d84cba8b4c 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -44,6 +44,8 @@ RUN apt-get update -qq && \
         cmake \
         git \
         wget \
+        curl \
+        unzip \
         ca-certificates \
         software-properties-common \
     && \

diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile
index 4e83bdcb0d..93d6244121 100644
--- a/dockers/tpu-tests/Dockerfile
+++ b/dockers/tpu-tests/Dockerfile
@@ -23,6 +23,12 @@ MAINTAINER PyTorchLightning

 COPY ./ ./pytorch-lightning/

+# Pull the legacy checkpoints
+RUN cd pytorch-lightning && \
+    wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ && \
+    unzip -o legacy/checkpoints.zip -d legacy/ && \
+    ls -l legacy/checkpoints/
+
 # If using this image for tests, install more dependencies and don't delete the source code where the tests live.
 RUN \
     # Install pytorch-lightning at the current PR, plus dependencies.

diff --git a/legacy/checkpoints/.gitkeep b/legacy/checkpoints/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2

diff --git a/legacy/generate_checkpoints.sh b/legacy/generate_checkpoints.sh
new file mode 100644
index 0000000000..c9f4dabff4
--- /dev/null
+++ b/legacy/generate_checkpoints.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Sample call:
+#  bash generate_checkpoints.sh 1.0.2 1.0.3 1.0.4
+
+LEGACY_PATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
+
+echo $LEGACY_PATH
+# install some PT version here so it does not need to be reinstalled for each env
+pip install virtualenv "torch==1.5" --quiet --no-cache-dir
+
+ENV_PATH="$LEGACY_PATH/vEnv"
+
+# iterate over all arguments, assuming that each argument is a version
+for ver in "$@"
+do
+  echo "processing version: $ver"
+  # mkdir "$LEGACY_PATH/$ver"
+
+  # create local env
+  echo $ENV_PATH
+  virtualenv $ENV_PATH --system-site-packages
+  # activate and install the PL version
+  source "$ENV_PATH/bin/activate"
+  pip install "pytorch_lightning==$ver" --quiet -U --no-cache-dir
+
+  python --version
+  pip --version
+  pip list | grep torch
+
+  python "$LEGACY_PATH/zero_training.py"
+  cp "$LEGACY_PATH/zero_training.py" ${LEGACY_PATH}/checkpoints/${ver}
+
+  mv ${LEGACY_PATH}/checkpoints/${ver}/lightning_logs/version_0/checkpoints/*.ckpt ${LEGACY_PATH}/checkpoints/${ver}/
+  rm -rf ${LEGACY_PATH}/checkpoints/${ver}/lightning_logs
+
+  deactivate
+  # clear env
+  rm -rf $ENV_PATH
+
+done
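Each loop iteration leaves a legacy/checkpoints/<version>/ folder holding the produced *.ckpt next to a copy of the zero_training.py that generated it. A produced checkpoint can be sanity-checked before it is pushed to S3; a sketch, assuming the checkpoint dict records its producer under the 'pytorch-lightning_version' key, as Lightning checkpoints of this era do:

    # inspect a generated legacy checkpoint (illustrative sketch)
    import glob
    import os

    import torch

    version = "1.0.8"  # any folder produced by generate_checkpoints.sh
    ckpts = sorted(glob.glob(os.path.join("legacy", "checkpoints", version, "*.ckpt")))
    assert ckpts, f"no checkpoint found for {version}"

    state = torch.load(ckpts[-1], map_location="cpu")
    print(state.get("pytorch-lightning_version"))  # version that wrote the file
    print(list(state["state_dict"]))               # layer.weight, layer.bias for DummyModel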
diff --git a/legacy/zero_training.py b/legacy/zero_training.py
new file mode 100644
index 0000000000..4e4952a3bb
--- /dev/null
+++ b/legacy/zero_training.py
@@ -0,0 +1,92 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import torch
+from torch.utils.data import Dataset
+
+import pytorch_lightning as pl
+
+PATH_LEGACY = os.path.dirname(__file__)
+
+
+class RandomDataset(Dataset):
+
+    def __init__(self, size, length: int = 100):
+        self.len = length
+        self.data = torch.randn(length, size)
+
+    def __getitem__(self, index):
+        return self.data[index]
+
+    def __len__(self):
+        return self.len
+
+
+class DummyModel(pl.LightningModule):
+
+    def __init__(self):
+        super().__init__()
+        self.layer = torch.nn.Linear(32, 2)
+
+    def forward(self, x):
+        return self.layer(x)
+
+    def _loss(self, batch, prediction):
+        # an arbitrary loss so that the model weights are updated during `Trainer.fit` calls
+        return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))
+
+    def _step(self, batch, batch_idx):
+        output = self.layer(batch)
+        loss = self._loss(batch, output)
+        return loss
+
+    def training_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+    def validation_step(self, batch, batch_idx):
+        self._step(batch, batch_idx)
+
+    def test_step(self, batch, batch_idx):
+        self._step(batch, batch_idx)
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
+        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
+        return [optimizer], [lr_scheduler]
+
+    def train_dataloader(self):
+        return torch.utils.data.DataLoader(RandomDataset(32, 64))
+
+    def val_dataloader(self):
+        return torch.utils.data.DataLoader(RandomDataset(32, 64))
+
+    def test_dataloader(self):
+        return torch.utils.data.DataLoader(RandomDataset(32, 64))
+
+
+def main_train(dir_path, max_epochs: int = 5):
+    trainer = pl.Trainer(
+        default_root_dir=dir_path,
+        checkpoint_callback=True,
+        max_epochs=max_epochs,
+    )
+
+    model = DummyModel()
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+    path_dir = os.path.join(PATH_LEGACY, 'checkpoints', str(pl.__version__))
+    main_train(path_dir)

diff --git a/setup.py b/setup.py
index 961540fb96..2993c96c23 100755
--- a/setup.py
+++ b/setup.py
@@ -69,7 +69,7 @@ setup(
     url=pytorch_lightning.__homepage__,
     download_url='https://github.com/PyTorchLightning/pytorch-lightning',
     license=pytorch_lightning.__license__,
-    packages=find_packages(exclude=['tests', 'tests/*', 'benchmarks']),
+    packages=find_packages(exclude=['tests', 'tests/*', 'benchmarks', 'legacy', 'legacy/*']),

     long_description=_load_readme_description(PATH_ROOT),
     long_description_content_type='text/markdown',

diff --git a/tests/__init__.py b/tests/__init__.py
index e0ec83a2ef..57feda6280 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -18,6 +18,8 @@ import numpy as np
 _TEST_ROOT = os.path.dirname(__file__)
 _PROJECT_ROOT = os.path.dirname(_TEST_ROOT)
 _TEMP_PATH = os.path.join(_PROJECT_ROOT, 'test_temp')
+DATASETS_PATH = os.path.join(_PROJECT_ROOT, 'Datasets')
+LEGACY_PATH = os.path.join(_PROJECT_ROOT, 'legacy')

 # todo: setting `PYTHONPATH` here may not be picked up by other envs like Conda when importing packages
 if _PROJECT_ROOT not in os.getenv('PYTHONPATH', ""):

diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py
new file mode 100644
index 0000000000..cb9fe443a3
--- /dev/null
+++ b/tests/checkpointing/test_legacy_checkpoints.py
@@ -0,0 +1,54 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import os
+import sys
+
+import pytest
+
+from pytorch_lightning import Trainer
+from tests import LEGACY_PATH
+
+LEGACY_CHECKPOINTS_PATH = os.path.join(LEGACY_PATH, 'checkpoints')
+CHECKPOINT_EXTENSION = ".ckpt"
+
+
+# todo: add more legacy checkpoints :]
+@pytest.mark.parametrize("pl_version", [
+    "0.10.0", "1.0.0", "1.0.1", "1.0.2", "1.0.3", "1.0.4", "1.0.5", "1.0.6", "1.0.7", "1.0.8"
+])
+def test_resume_legacy_checkpoints(tmpdir, pl_version):
+    path_dir = os.path.join(LEGACY_CHECKPOINTS_PATH, pl_version)
+
+    # todo: make this a mock, so it is cleaner...
+    orig_sys_paths = list(sys.path)
+    sys.path.insert(0, path_dir)
+    from zero_training import DummyModel
+
+    path_ckpts = sorted(glob.glob(os.path.join(path_dir, f'*{CHECKPOINT_EXTENSION}')))
+    assert path_ckpts, f'No checkpoints found in folder "{path_dir}"'
+    path_ckpt = path_ckpts[-1]
+
+    model = DummyModel.load_from_checkpoint(path_ckpt)
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=6)
+    result = trainer.fit(model)
+    assert result
+
+    # todo: also test resuming the full Trainer state, e.g.:
+    # model = DummyModel()
+    # trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, resume_from_checkpoint=path_ckpt)
+    # result = trainer.fit(model)
+    # assert result
+
+    sys.path = orig_sys_paths
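The commented-out block above is the second half of the compatibility story: resuming the full Trainer state rather than only loading weights. Enabled, it would look roughly like the sketch below (paths are illustrative; `resume_from_checkpoint` is the Trainer argument of this Lightning era, and importing DummyModel requires the version folder on sys.path, exactly as in the test):

    # sketch of the resume path from the todo above (illustrative)
    import glob
    import os
    import sys

    path_dir = os.path.join("legacy", "checkpoints", "1.0.8")
    sys.path.insert(0, path_dir)  # makes the copied zero_training.py importable
    from zero_training import DummyModel

    from pytorch_lightning import Trainer

    path_ckpt = sorted(glob.glob(os.path.join(path_dir, "*.ckpt")))[-1]
    model = DummyModel()
    # restores epoch, optimizer and LR-scheduler state before continuing training
    trainer = Trainer(default_root_dir="resume_logs", max_epochs=6, resume_from_checkpoint=path_ckpt)
    trainer.fit(model)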