From a95ef5a4ac4422d79adbc5061039e15eae5cf280 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Thu, 9 Jul 2020 06:46:07 -0400
Subject: [PATCH] remove parameterize from TPU tests (#2561)

* added base tests for tpu

* added base tests for tpu

* added base tests for tpu

* added base tests for tpu

* added base tests for tpu
---
 tests/base/develop_pipelines.py |   2 +-
 tests/models/test_tpu.py        | 130 +++++++++++++++++++++++++++-----
 2 files changed, 113 insertions(+), 19 deletions(-)

diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py
index 489e2cefe9..ba698e82c8 100644
--- a/tests/base/develop_pipelines.py
+++ b/tests/base/develop_pipelines.py
@@ -50,7 +50,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi
     result = trainer.fit(model)
 
     # correct result and ok accuracy
-    assert result == 1, 'amp + ddp model failed to complete'
+    assert result == 1, 'trainer failed'
 
     # test model loading
     pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path)
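Every test in this patch funnels through `run_model_test`, the shared pipeline whose assertion message is generalized above: it builds a `Trainer` from the options dict, fits the model, checks the fit result, and round-trips the best checkpoint. A condensed sketch of that flow for readers (the name `run_model_test_sketch` is hypothetical, and the body is illustrative rather than the helper's full implementation, which also wires up a logger and the optional `with_hpc` save/load path visible in its signature):

    from pytorch_lightning import Trainer

    def run_model_test_sketch(trainer_options: dict, model) -> None:
        # Build a trainer from the same kind of options dict the TPU tests pass in.
        trainer = Trainer(**trainer_options)
        result = trainer.fit(model)
        # In this Lightning version, fit() returns 1 once training completes.
        assert result == 1, 'trainer failed'
        # Reload the best checkpoint to verify it round-trips.
        pretrained_model = type(model).load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
        assert pretrained_model is not None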
diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py
index 5ec2e7e7d9..138fe0a415 100644
--- a/tests/models/test_tpu.py
+++ b/tests/models/test_tpu.py
@@ -7,6 +7,8 @@ from pytorch_lightning import Trainer
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import EvalModelTemplate
 import tests.base.develop_pipelines as tpipes
+from tests.base.datasets import TrialMNIST
+from torch.utils.data import DataLoader
 
 try:
     import torch_xla
@@ -21,14 +23,13 @@ else:
 
 
 @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-@pytest.mark.parametrize("tpu_cores", [1, [1], 8])
-def test_base_tpu_model(tmpdir, tpu_cores):
+def test_base_tpu_model_1(tmpdir):
     """Make sure model trains on TPU."""
     trainer_options = dict(
         default_root_dir=tmpdir,
         progress_bar_refresh_rate=0,
         max_epochs=1,
-        tpu_cores=tpu_cores,
+        tpu_cores=1,
         limit_train_batches=0.4,
         limit_val_batches=0.4
     )
@@ -38,23 +39,104 @@
 
 @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-@pytest.mark.parametrize("tpu_cores", [1, [1], 8])
-def test_base_tpu_16bit_model(tmpdir, tpu_cores):
+def test_base_tpu_model_idx_1(tmpdir):
     """Make sure model trains on TPU."""
     trainer_options = dict(
         default_root_dir=tmpdir,
-        precision=16,
         progress_bar_refresh_rate=0,
         max_epochs=1,
-        tpu_cores=tpu_cores,
+        tpu_cores=[1],
+        limit_train_batches=0.4,
+        limit_val_batches=0.4
+    )
+
+    model = EvalModelTemplate()
+    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
+
+
+@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
+def test_base_tpu_model_8(tmpdir):
+    """Make sure model trains on 8 TPU cores."""
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        progress_bar_refresh_rate=0,
+        max_epochs=1,
+        tpu_cores=8,
+        limit_train_batches=0.4,
+        limit_val_batches=0.4
+    )
 
     model = EvalModelTemplate()
-    tpipes.run_model_test(trainer_options, model, on_gpu=False)
+
+    # 8 cores needs a big dataset: the samples are sharded across the cores
+    def long_train_loader():
+        loader = DataLoader(TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8)), batch_size=32)
+        return loader
+    model.train_dataloader = long_train_loader
+    model.val_dataloader = long_train_loader
+    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
+
+
+@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
+def test_base_tpu_16bit_model_core_1(tmpdir):
+    """Make sure the 16-bit model trains on a single TPU core."""
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        precision=16,
+        progress_bar_refresh_rate=0,
+        max_epochs=1,
+        tpu_cores=1,
+        limit_train_batches=0.4,
+        limit_val_batches=0.4
+    )
+
+    model = EvalModelTemplate()
+    tpipes.run_model_test(trainer_options, model, on_gpu=False)
+    assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
+
+
+@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
+def test_base_tpu_16bit_model_idx_core(tmpdir):
+    """Make sure the 16-bit model trains when pinned to TPU core index 1."""
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        precision=16,
+        progress_bar_refresh_rate=0,
+        max_epochs=1,
+        tpu_cores=[1],
+        limit_train_batches=0.4,
+        limit_val_batches=0.4
+    )
+
+    model = EvalModelTemplate()
+    tpipes.run_model_test(trainer_options, model, on_gpu=False)
+    assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
+
+
+@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
+def test_base_tpu_16bit_model_8_cores(tmpdir):
+    """Make sure the 16-bit model trains on 8 TPU cores."""
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        precision=16,
+        progress_bar_refresh_rate=0,
+        max_epochs=1,
+        tpu_cores=8,
+        limit_train_batches=0.4,
+        limit_val_batches=0.4
+    )
+
+    model = EvalModelTemplate()
+
+    # 8 cores needs a big dataset: the samples are sharded across the cores
+    def long_train_loader():
+        loader = DataLoader(TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8)), batch_size=32)
+        return loader
+    model.train_dataloader = long_train_loader
+    model.val_dataloader = long_train_loader
+
+    tpipes.run_model_test(trainer_options, model, on_gpu=False)
     assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
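All three 16-bit tests end by asserting on `XLA_USE_BF16` because TPUs do not use CUDA-style AMP: requesting `precision=16` makes Lightning ask the XLA runtime for bfloat16 via an environment variable, which is why the assertion compares against `str(1)`. A minimal sketch of that mechanism (illustrative; `enable_tpu_bf16` is a hypothetical helper, not Lightning's actual code):

    import os

    def enable_tpu_bf16(precision: int) -> None:
        # torch_xla reads this variable when materializing tensors; setting it
        # makes float32 tensors be stored as bfloat16 on the TPU.
        if precision == 16:
            os.environ['XLA_USE_BF16'] = str(1)

    enable_tpu_bf16(16)
    assert os.environ.get('XLA_USE_BF16') == str(1)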
@@ -80,11 +162,24 @@ def test_early_stop_checkpoints_on_tpu(tmpdir, tpu_cores, expected_device):
 
 
 @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-@pytest.mark.parametrize(['tpu_cores', 'expected_device'], [
-    pytest.param([1], 'xla:1'),
-    pytest.param([8], 'xla:8'),
-])
-def test_single_tpu_core_model(tmpdir, tpu_cores, expected_device):
+def test_early_stop_checkpoints_on_tpu_1(tmpdir):
+    """Test that early stopping and checkpointing work on a single TPU core."""
+    model = EvalModelTemplate()
+    trainer = Trainer(
+        early_stop_callback=True,
+        default_root_dir=tmpdir,
+        progress_bar_refresh_rate=0,
+        max_epochs=50,
+        limit_train_batches=10,
+        limit_val_batches=10,
+        tpu_cores=1,
+    )
+    trainer.fit(model)
+    assert torch_xla._XLAC._xla_get_default_device() == 'xla:1'
+
+
+@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
+def test_single_tpu_core_model(tmpdir):
     """Test if single TPU core training works"""
     model = EvalModelTemplate()
     trainer = Trainer(
@@ -93,15 +188,14 @@
         max_epochs=1,
         train_percent_check=0.1,
         val_percent_check=0.1,
-        tpu_cores=tpu_cores,
+        tpu_cores=[8],
     )
     trainer.fit(model)
-    assert torch_xla._XLAC._xla_get_default_device() == expected_device
+    assert torch_xla._XLAC._xla_get_default_device() == 'xla:8'
 
 
-@pytest.mark.parametrize("tpu_cores", [1, 8])
 @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-def test_multi_core_tpu_model(tmpdir, tpu_cores):
+def test_multi_core_tpu_model(tmpdir):
     """Test if distributed TPU core training works"""
     model = EvalModelTemplate()
     trainer = Trainer(
@@ -109,7 +203,7 @@
         default_root_dir=tmpdir,
         max_epochs=1,
         train_percent_check=0.4,
         val_percent_check=0.2,
-        tpu_cores=tpu_cores,
+        tpu_cores=8,
     )
     trainer.fit(model)
     assert trainer.tpu_id is None
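The closing assertion depends on how Lightning derives `trainer.tpu_id` from `tpu_cores`: a one-element list such as `[1]` or `[8]` pins training to that specific core index, while an int such as 1 or 8 means "use this many cores" and leaves `tpu_id` unset, which is what the multi-core test checks. A minimal sketch of that parsing rule (illustrative; `parse_tpu_id` is a hypothetical name, not Lightning's internal function):

    from typing import List, Optional, Union

    def parse_tpu_id(tpu_cores: Union[int, List[int], None]) -> Optional[int]:
        # [1] or [8] -> train on that single core index; 1 or 8 -> spawn
        # across that many cores, so there is no single core id.
        if isinstance(tpu_cores, list) and len(tpu_cores) == 1:
            return tpu_cores[0]
        return None

    assert parse_tpu_id([8]) == 8
    assert parse_tpu_id(8) is None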