diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml
index b6d225ffa3..27e542de80 100644
--- a/.azure/app-cloud-e2e.yml
+++ b/.azure/app-cloud-e2e.yml
@@ -148,11 +148,13 @@ jobs:
   # It also e2e tests running on cloud without installing dependencies.
   - bash: |
       git clone https://github.com/Lightning-AI/lightning-quick-start examples/app/quick-start
-    condition: eq(variables['name'], 'quick_start')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['name'], 'quick_start'))
     displayName: 'Clone Quick start Repo'
   - bash: |
       git clone https://github.com/Lightning-AI/lightning-template-react examples/app/template_react_ui
-    condition: eq(variables['name'], 'template_react_ui')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['name'], 'template_react_ui'))
     displayName: 'Clone Template React UI Repo'

   # Replace imports to use `lightning` instead of `lightning_app` since we install lightning only ATM
diff --git a/.azure/gpu-benchmarks.yml b/.azure/gpu-benchmarks.yml
index 422194d831..1859403af8 100644
--- a/.azure/gpu-benchmarks.yml
+++ b/.azure/gpu-benchmarks.yml
@@ -103,7 +103,8 @@ jobs:

   - bash: bash run_standalone_tasks.sh
     workingDirectory: tests/parity_fabric
-    condition: eq(variables['PACKAGE_NAME'], 'fabric')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
     env:
       PL_RUN_CUDA_TESTS: "1"
     displayName: 'Testing: fabric standalone tasks'
diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml
index 0b4e27e3e4..c32b92b8db 100644
--- a/.azure/gpu-tests-fabric.yml
+++ b/.azure/gpu-tests-fabric.yml
@@ -130,7 +130,8 @@ jobs:

   - bash: python -m pytest lightning_fabric
     workingDirectory: src
-    condition: eq(variables['PACKAGE_NAME'], 'fabric')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
     displayName: 'Testing: Fabric doctests'

   - bash: |
@@ -141,7 +142,8 @@ jobs:
       python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \
         --source_import="lightning.fabric" \
         --target_import="lightning_fabric"
-    condition: eq(variables['PACKAGE_NAME'], 'fabric')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
     displayName: 'Adjust tests & examples'

   - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index 3b45218d83..e205588bee 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -133,11 +133,13 @@ jobs:
     displayName: 'Bump to nightly'

   - bash: pip uninstall -y lightning
-    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'pytorch'))
     # Lightning is dependency of Habana or other accelerators/integrations so in case we test PL we need to remove it
     displayName: 'Drop LAI from extensions'
   - bash: pip uninstall -y pytorch-lightning
-    condition: eq(variables['PACKAGE_NAME'], 'lightning')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'lightning'))
     displayName: 'Drop PL for LAI'

   - bash: |
@@ -149,7 +151,8 @@ jobs:

   - bash: python -m pytest pytorch_lightning
     workingDirectory: src
-    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'pytorch'))
     displayName: 'Testing: PyTorch doctests'

   - bash: |
@@ -159,7 +162,8 @@ jobs:
       python .actions/assistant.py copy_replace_imports --source_dir="./examples/pytorch/basics" \
         --source_import="lightning.fabric,lightning.pytorch" \
         --target_import="lightning_fabric,pytorch_lightning"
-    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
+    # without succeeded this could run even if the job has already failed
+    condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'pytorch'))
     displayName: 'Adjust tests & examples'

   - bash: |
diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index f6f5ade517..c278552aef 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -143,6 +143,7 @@ subprojects:
       - "build-cuda (3.9, 1.12, 11.7.1)"
      - "build-cuda (3.9, 1.13, 12.0.1)"
      - "build-cuda (3.10, 2.0, 12.0.1)"
+      - "build-cuda (3.10, 2.0, 11.7.1)"
      #- "build-NGC"
      - "build-pl (3.9, 1.11, 11.3.1)"
      - "build-pl (3.9, 1.12, 11.7.1)"
diff --git a/.github/workflows/ci-dockers.yml b/.github/workflows/ci-dockers.yml
index fbfa203f58..d982539e6d 100644
--- a/.github/workflows/ci-dockers.yml
+++ b/.github/workflows/ci-dockers.yml
@@ -74,6 +74,8 @@ jobs:
           - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.7.1"}
          - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "12.0.1"}
          - {python_version: "3.10", pytorch_version: "2.0", cuda_version: "12.0.1"}
+          # these are used in Azure GPU CI
+          - {python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.7.1"}
     steps:
      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml
index 525420c642..fe8737cd33 100644
--- a/.github/workflows/ci-tests-fabric.yml
+++ b/.github/workflows/ci-tests-fabric.yml
@@ -112,8 +112,7 @@ jobs:
         run: |
          python -m pip install -q pip -U
          extra=$(python -c "print({'lightning': 'fabric-'}.get('${{ matrix.pkg-name }}', ''))")
-          pip install -e ".[${extra}test]" "pytest-timeout" -U -f ${TORCH_URL} ${TORCH_PREINSTALL} -f ${PYPI_CACHE_DIR} --prefer-binary
-          pip install -r requirements/fabric/strategies.txt -f ${PYPI_CACHE_DIR} --prefer-binary
+          pip install -e ".[${extra}test,${extra}strategies]" "pytest-timeout" -U -f ${TORCH_URL} ${TORCH_PREINSTALL} -f ${PYPI_CACHE_DIR} --prefer-binary
          pip list
      - name: Dump handy wheels
        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml
index 55b372be40..827f5a417c 100644
--- a/.github/workflows/ci-tests-pytorch.yml
+++ b/.github/workflows/ci-tests-pytorch.yml
@@ -116,7 +116,7 @@ jobs:
         run: |
          python -m pip install -q pip -U
          extra=$(python -c "print({'lightning': 'pytorch-'}.get('${{ matrix.pkg-name }}', ''))")
-          pip install ".[${extra}extra,${extra}test]" -U \
+          pip install ".[${extra}extra,${extra}test,${extra}strategies]" -U \
            "pytest-timeout" -r requirements/_integrations/accelerators.txt \
            -f ${TORCH_URL} ${TORCH_PREINSTALL} -f ${PYPI_CACHE_DIR} --prefer-binary
          pip list
diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml
index 73c6af9def..7d25cb1bd2 100644
--- a/.github/workflows/tpu-tests.yml
+++ b/.github/workflows/tpu-tests.yml
@@ -115,7 +115,7 @@ jobs:
         env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.runtime }}-${{ env.SHA }}
        run: |
-          gcloud compute tpus tpu-vm create "$JOB_NAME" --accelerator-type=v4-8 --version="tpu-vm-v4-pt-$XLA_VER"
+          gcloud compute tpus tpu-vm create "$JOB_NAME" --accelerator-type=v4-8 --version="tpu-vm-v4-pt-$XLA_VER" --preemptible

      - name: Cancel job
        if: steps.tpu-create.outcome != 'success'
diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt
index 5eedfac596..9ce6376a41 100644
--- a/requirements/fabric/base.txt
+++ b/requirements/fabric/base.txt
@@ -4,6 +4,6 @@
 numpy >=1.17.2, <1.25.1
 torch >=1.11.0, <2.1.0
 fsspec[http]>2021.06.0, <2023.5.0
-packaging >=17.1, <=23.0
+packaging >=20.0, <=23.0
 typing-extensions >=4.0.0, <=4.4.0
 lightning-utilities >=0.8.0, <0.10.0
diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt
index c4b7240b70..c205f614ca 100644
--- a/requirements/pytorch/base.txt
+++ b/requirements/pytorch/base.txt
@@ -7,6 +7,6 @@
 tqdm >=4.57.0, <4.66.0
 PyYAML >=5.4, <=6.0
 fsspec[http] >2021.06.0, <2023.5.0
 torchmetrics >=0.7.0, <1.1.0  # needed for using fixed compare_version
-packaging >=17.1, <=23.0
+packaging >=20.0, <=23.0
 typing-extensions >=4.0.0, <=4.4.0
 lightning-utilities >=0.8.0, <0.10.0
diff --git a/src/lightning/fabric/accelerators/cuda.py b/src/lightning/fabric/accelerators/cuda.py
index 8ecb7a8991..2f19529597 100644
--- a/src/lightning/fabric/accelerators/cuda.py
+++ b/src/lightning/fabric/accelerators/cuda.py
@@ -360,7 +360,8 @@ def _check_cuda_matmul_precision(device: torch.device) -> None:


 def _clear_cuda_memory() -> None:
-    if _TORCH_GREATER_EQUAL_2_0:
+    # strangely, the attribute can be undefined when torch.compile is used
+    if _TORCH_GREATER_EQUAL_2_0 and hasattr(torch._C, "_cuda_clearCublasWorkspaces"):
         # https://github.com/pytorch/pytorch/issues/95668
         torch._C._cuda_clearCublasWorkspaces()
     torch.cuda.empty_cache()
diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py
index cc0e287fee..3a9b0e3ad4 100644
--- a/tests/tests_fabric/strategies/test_fsdp_integration.py
+++ b/tests/tests_fabric/strategies/test_fsdp_integration.py
@@ -347,7 +347,14 @@ def test_setup_with_orig_params_and_multiple_param_groups():

 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, dynamo=True)
 @mock.patch.dict(os.environ, {})
-@pytest.mark.parametrize("compile_after_setup", [False, True])
+@pytest.mark.parametrize(
+    "compile_after_setup",
+    [
+        False,
+        # https://github.com/pytorch/pytorch/issues/97811
+        pytest.param(True, marks=RunIf(min_python="3.9")),
+    ],
+)
 def test_compile(compile_after_setup):
     """Test that the model can be compiled before and after the model is wrapped in FSDP."""
     model = BoringModel()
diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py
index 8fddc54a3c..fc6762b373 100644
--- a/tests/tests_pytorch/conftest.py
+++ b/tests/tests_pytorch/conftest.py
@@ -155,7 +155,7 @@ def mock_mps_count(monkeypatch, n: int) -> None:
         # torch doesn't allow creation of mps devices on older versions
         monkeypatch.setattr("torch.device", MpsDeviceMock)
-    monkeypatch.setattr(lightning.fabric.accelerators.mps, "_get_all_available_mps_gpus", lambda: list(range(n)))
+    monkeypatch.setattr(lightning.fabric.accelerators.mps, "_get_all_available_mps_gpus", lambda: [0] if n > 0 else [])
     monkeypatch.setattr(lightning.fabric.accelerators.mps.MPSAccelerator, "is_available", lambda *_: n > 0)


@@ -169,16 +169,6 @@ def mps_count_1(monkeypatch):
     mock_mps_count(monkeypatch, 1)


-@pytest.fixture()
-def mps_count_2(monkeypatch):
-    mock_mps_count(monkeypatch, 2)
-
-
-@pytest.fixture()
-def mps_count_4(monkeypatch):
-    mock_mps_count(monkeypatch, 4)
-
-
 def mock_xla_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
     monkeypatch.setattr(lightning.pytorch.strategies.xla, "_XLA_AVAILABLE", value)
     monkeypatch.setattr(lightning.pytorch.strategies.single_xla, "_XLA_AVAILABLE", value)
diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
index 7fe31e132a..60aa07994c 100644
--- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
@@ -781,6 +781,7 @@ def test_connector_defaults_match_trainer_defaults():


 @RunIf(min_cuda_gpus=1)  # trigger this test on our GPU pipeline, because we don't install the package on the CPU suite
+@pytest.mark.xfail(raises=ImportError, reason="Not updated to latest API")
 @pytest.mark.skipif(not package_available("lightning_colossalai"), reason="Requires Colossal AI Strategy")
 def test_colossalai_external_strategy(monkeypatch):
     with mock.patch(
@@ -795,6 +796,7 @@ def test_colossalai_external_strategy(monkeypatch):


 @RunIf(min_cuda_gpus=1)  # trigger this test on our GPU pipeline, because we don't install the package on the CPU suite
+@pytest.mark.xfail(raises=ImportError, reason="Not updated to latest API")
 @pytest.mark.skipif(not package_available("lightning_bagua"), reason="Requires Bagua Strategy")
 def test_bagua_external_strategy(monkeypatch):
     with mock.patch(