From 3772601cd6872cde006aab9284e103e857955457 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 31 Jul 2020 20:50:06 +0200 Subject: [PATCH] update CI testing with pip upgrade (#2380) * try pt1.5 * cpu * upgrade * tpu * user * [blocked by #2380] freeze GPU PT 1.4 (#2780) * freeze * user --- .drone.yml | 2 +- .github/workflows/ci-test-base.yml | 2 +- .github/workflows/ci-testing.yml | 13 ++----- README.md | 2 + tests/models/test_tpu.py | 60 ++++++------------------------ 5 files changed, 20 insertions(+), 59 deletions(-) diff --git a/.drone.yml b/.drone.yml index 67f0c38758..edb6f48bbb 100644 --- a/.drone.yml +++ b/.drone.yml @@ -6,7 +6,7 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:cuda-extras-py3.7-torch1.5 + image: pytorchlightning/pytorch_lightning:devel-pt1.4 environment: SLURM_LOCALID: 0 diff --git a/.github/workflows/ci-test-base.yml b/.github/workflows/ci-test-base.yml index 7def5ca4b1..855a9831fd 100644 --- a/.github/workflows/ci-test-base.yml +++ b/.github/workflows/ci-test-base.yml @@ -57,7 +57,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade --user pip - pip install --requirement ./requirements/base.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade-strategy only-if-needed + pip install --requirement ./requirements/base.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade pip install --requirement ./requirements/test.txt --quiet --upgrade-strategy only-if-needed # pip install tox coverage python --version diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index d846950a37..0ed2db4755 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -42,7 +42,7 @@ jobs: - name: Update Pip run: | - pip install -U -q "pip>=20.1" # needed for get pip cacher folder + pip install --quiet "pip>=20.1" --upgrade --user # needed for get pip cacher folder # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS @@ -54,14 +54,9 @@ jobs: - name: Setup Windows if: runner.os == 'windows' run: | + # remove Horovod from requirements python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" - # TODO: remove after https://github.com/pytorch/pytorch/issues/32186 is resolved - #- name: Setup Windows on Latest - # if: runner.os == 'windows' && matrix.requires == 'latest' - # run: | - # python -c "fname = 'requirements/base.txt' ; req = open(fname).read().replace('torch>=1.3', 'torch<1.5') ; open(fname, 'w').write(req)" - # versions <= 1.3 may have issues on mac with some BLAS ops due to missing mkl (https://github.com/pytorch/pytorch/issues/18996) - name: Setup MacOS Minimal if: runner.os == 'macOS' && matrix.requires == 'minimal' @@ -92,8 +87,8 @@ jobs: - name: Install dependencies run: | - pip install --requirement requirements/base.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade-strategy only-if-needed - # pip install -q "PyYAML>=5.3.1" # needed for installing dependencues + # python -m pip install --upgrade --user pip + pip install --requirement requirements/base.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade HOROVOD_BUILD_ARCH_FLAGS="-mfma" pip install --requirement ./requirements/devel.txt --quiet --upgrade-strategy "only-if-needed" python --version pip --version diff --git a/README.md b/README.md index f0f022e600..2b21327999 100644 --- a/README.md +++ b/README.md @@ -437,6 +437,8 @@ You can also install any past release `0.X.Y` from this repository: pip install https://github.com/PytorchLightning/pytorch-lightning/archive/0.X.Y.zip --upgrade ``` +--- + ## Lightning team #### Leads diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index ccc68cb59b..ecbeb821a3 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -51,42 +51,24 @@ def test_model_tpu_cores_1(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) +@pytest.mark.parametrize('tpu_core', [1, 5]) @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_tpu_index_1(tmpdir): +def test_model_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, distributed_backend='tpu', - tpu_cores=[1], + tpu_cores=[tpu_core], limit_train_batches=0.4, limit_val_batches=0.4, ) model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:1' - - -@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -@pl_multi_process_test -def test_model_tpu_index_5(tmpdir): - """Make sure model trains on TPU.""" - trainer_options = dict( - default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - distributed_backend='tpu', - tpu_cores=[5], - limit_train_batches=0.4, - limit_val_batches=0.4, - ) - - model = EvalModelTemplate() - tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' + assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -131,24 +113,27 @@ def test_model_16bit_tpu_cores_1(tmpdir): assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" +@pytest.mark.parametrize('tpu_core', [1, 5]) @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_16bit_tpu_index_1(tmpdir): +def test_model_16bit_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, precision=16, progress_bar_refresh_rate=0, + train_percent_check=0.4, + val_percent_check=0.2, max_epochs=1, distributed_backend='tpu', - tpu_cores=[1], + tpu_cores=[tpu_core], limit_train_batches=0.4, limit_val_batches=0.4, ) model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:1' + assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -177,27 +162,7 @@ def test_model_16bit_tpu_cores_8(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_16bit_tpu_index_5(tmpdir): - """Test if distributed TPU core training works""" - model = EvalModelTemplate() - trainer = Trainer( - default_root_dir=tmpdir, - precision=16, - max_epochs=1, - train_percent_check=0.4, - val_percent_check=0.2, - distributed_backend='tpu', - tpu_cores=[5], - ) - trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' - assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" - - -@pytest.mark.parametrize('tpu_core', [1, 5]) -@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -@pl_multi_process_test -def test_train_on_single_tpu(tmpdir, tpu_core): +def test_model_tpu_early_stop(tmpdir): """Test if single TPU core training works""" model = EvalModelTemplate() trainer = Trainer( @@ -208,10 +173,9 @@ def test_train_on_single_tpu(tmpdir, tpu_core): limit_train_batches=10, limit_val_batches=10, distributed_backend='tpu', - tpu_cores=[tpu_core], + tpu_cores=1, ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")