From 13baad56e48e6b0b370dc30433e319b54244d8cc Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Tue, 25 Oct 2022 11:29:15 -0700 Subject: [PATCH] Add support for custom cloud compute configurations for Flows (#14831) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * use more recent lightning cloud launcher * allow LightningApp to use custom cloud compute for flows * feedback from adrian * adjust other cloud tests * update * update * update commens * Update src/lightning_app/core/app.py Co-authored-by: Sherin Thomas * Close profiler when `StopIteration` is raised (#14945) * Find last checkpoints on restart (#14907) Co-authored-by: Carlos Mocholí * Remove unused gcsfs dependency (#14962) * Update hpu mixed precision link (#14974) Signed-off-by: Jerome * Bump version of fsspec (#14975) fsspec verbump * Fix TPU test CI (#14926) * Fix TPU test CI * +x first * Lite first to uncovert errors faster * Fixes * One more * Simplify XLALauncher wrapping to avoid pickle error * debug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Debug commit successful. Trying local definitions * Require tpu for mock test * ValueError: The number of devices must be either 1 or 8, got 4 instead * Fix mock test * Simplify call, rely on defaults * Skip OSError for now. Maybe upgrading will help * Simplify launch tests, move some to lite * Stricter typing * RuntimeError: Accessing the XLA device before processes have spawned is not allowed. * Revert "RuntimeError: Accessing the XLA device before processes have spawned is not allowed." This reverts commit f65107ebf3e062d497f1033bfbbd59774f2d253f. * Alternative boring solution to the reverted commit * Fix failing test on CUDA machine * Workarounds * Try latest mkl * Revert "Try latest mkl" This reverts commit d06813aa67cc161879775e24be24b735e2925555. 
* Wrong exception * xfail * Mypy * Comment change * Spawn launch refactor * Accept that we cannot lazy init now * Fix mypy and launch test failures * The base dockerfile already includes mkl-2022.1.0 - what if we use it? * try a different mkl version * Revert mkl version changes Co-authored-by: awaelchli Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta * Trainer: fix support for non-distributed PyTorch (#14971) * Trainer: fix non-distributed use * Update CHANGELOG * fixes typing errors in rich_progress.py (#14963) * revert default cloud compute rename * allow LightningApp to use custom cloud compute for flows * feedback from adrian * update * resolve merge with master conflict * remove preemptible * update CHANGELOG * add basic flow cloud compute documentation * fix docs build * add missing symlink * try to fix sphinx * another attempt for docs * fix new test Signed-off-by: Jerome Co-authored-by: thomas chaton Co-authored-by: Sherin Thomas Co-authored-by: Ziyad Sheebaelhamd <47150407+ziyadsheeba@users.noreply.github.com> Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> Co-authored-by: Carlos Mocholí Co-authored-by: Jerome Anand <88475913+jerome-habana@users.noreply.github.com> Co-authored-by: awaelchli Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta Co-authored-by: Adam J. 
Stewart Co-authored-by: DP <10988155+donlapark@users.noreply.github.com> --- .../lightning_app/compute_content.rst | 40 +++++++++++++++ .../core_api/lightning_app/index.rst | 8 +++ .../lightning_app/compute_content.rst | 1 + src/lightning_app/CHANGELOG.md | 5 +- src/lightning_app/core/app.py | 4 ++ src/lightning_app/runners/cloud.py | 7 +++ tests/tests_app/runners/test_cloud.py | 50 +++++++++++++++++++ 7 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 docs/source-app/core_api/lightning_app/compute_content.rst create mode 120000 docs/source-lit/core_api/lightning_app/compute_content.rst diff --git a/docs/source-app/core_api/lightning_app/compute_content.rst b/docs/source-app/core_api/lightning_app/compute_content.rst new file mode 100644 index 0000000000..753bcacf33 --- /dev/null +++ b/docs/source-app/core_api/lightning_app/compute_content.rst @@ -0,0 +1,40 @@ +:orphan: + +*************************** +Customize my Flow resources +*************************** + +In the cloud, you can simply configure which machine to run on by passing +a :class:`~lightning_app.utilities.packaging.cloud_compute.CloudCompute` to the ``flow_cloud_compute`` argument of your ``LightningApp``: + +.. code-block:: python + + import lightning as L + + # Run on a small, shared CPU machine. This is the default for every LightningFlow. + app = L.LightningApp(L.Flow(), flow_cloud_compute=L.CloudCompute()) + + +Here is the full list of supported machine names: + +.. list-table:: Hardware by Accelerator Type + :widths: 25 25 25 + :header-rows: 1 + + * - Name + - # of CPUs + - Memory + * - flow-lite + - 0.3 + - 4 GB + +The up-to-date prices for these instances can be found `here `_. + +---- + +************ +CloudCompute +************ + +.. 
autoclass:: lightning_app.utilities.packaging.cloud_compute.CloudCompute + :noindex: diff --git a/docs/source-app/core_api/lightning_app/index.rst b/docs/source-app/core_api/lightning_app/index.rst index 1cdb4360db..eeae294adc 100644 --- a/docs/source-app/core_api/lightning_app/index.rst +++ b/docs/source-app/core_api/lightning_app/index.rst @@ -39,6 +39,14 @@ Peek under the hood :height: 180 :tag: Intermediate +.. displayitem:: + :header: Customize Flow compute resources + :description: Learn more about Flow customizations. + :col_css: col-md-4 + :button_link: compute_content.html + :height: 180 + :tag: Intermediate + .. displayitem:: :header: Dynamically create, execute and stop Work :description: Learn more about components creation. diff --git a/docs/source-lit/core_api/lightning_app/compute_content.rst b/docs/source-lit/core_api/lightning_app/compute_content.rst new file mode 120000 index 0000000000..1e2e13cf91 --- /dev/null +++ b/docs/source-lit/core_api/lightning_app/compute_content.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_app/compute_content.rst \ No newline at end of file diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 534342ccf8..6efc2a5802 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -13,10 +13,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added a `--secret` option to CLI to allow binding secrets to app environment variables when running in the cloud ([#14612](https://github.com/Lightning-AI/lightning/pull/14612)) - Added support for running the works without cloud compute in the default container ([#14819](https://github.com/Lightning-AI/lightning/pull/14819)) - Added an HTTPQueue as an optional replacement for the default redis queue ([#14978](https://github.com/Lightning-AI/lightning/pull/14978) -- Added authentication to HTTP queue ([#15202](https://github.com/Lightning-AI/lightning/pull/15202)) +- Added support for configuring flow cloud compute ([#14831](https://github.com/Lightning-AI/lightning/pull/14831)) - Added support for adding descriptions to commands either through a docstring or the `DESCRIPTION` attribute ([#15193](https://github.com/Lightning-AI/lightning/pull/15193) - Added a try / catch mechanism around request processing to avoid killing the flow ([#15187](https://github.com/Lightning-AI/lightning/pull/15187) -- Added a Database Component ([#14995](https://github.com/Lightning-AI/lightning/pull/14995) +- Added a Database Component ([#14995](https://github.com/Lightning-AI/lightning/pull/14995)) +- Added authentication to HTTP queue ([#15202](https://github.com/Lightning-AI/lightning/pull/15202)) - Added support to pass a `LightningWork` to the `LightningApp` ([#15215](https://github.com/Lightning-AI/lightning/pull/15215) - Added support getting CLI help for connected apps even if the app isn't running ([#15196](https://github.com/Lightning-AI/lightning/pull/15196) - Added support for adding requirements to commands and installing them when missing when running an app command ([#15198](https://github.com/Lightning-AI/lightning/pull/15198) diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index b2c84fd1c8..dbc43a6222 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -11,6 +11,7 @@ from typing import Dict, List, Optional, 
Tuple, TYPE_CHECKING, Union from deepdiff import DeepDiff, Delta from lightning_utilities.core.apply_func import apply_to_collection +import lightning_app from lightning_app import _console from lightning_app.api.request_types import APIRequest, CommandRequest, DeltaRequest from lightning_app.core.constants import ( @@ -50,6 +51,7 @@ class LightningApp: def __init__( self, root: Union["LightningFlow", "LightningWork"], + flow_cloud_compute: Optional["lightning_app.CloudCompute"] = None, debug: bool = False, info: frontend.AppInfo = None, root_path: str = "", @@ -67,6 +69,7 @@ class LightningApp: Arguments: root: The root ``LightningFlow`` or ``LightningWork`` component, that defines all the app's nested components, running infinitely. It must define a `run()` method that the app can call. + flow_cloud_compute: The default Cloud Compute used for the flow, REST API and frontends. debug: Whether to activate the Lightning Logger debug mode. This can be helpful when reporting bugs on Lightning repo. info: Provide additional info about the app which will be used to update html title, @@ -100,6 +103,7 @@ class LightningApp: _validate_root_flow(root) self._root = root + self.flow_cloud_compute = flow_cloud_compute or lightning_app.CloudCompute() # queues definition. 
self.delta_queue: Optional[BaseQueue] = None diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index fe780c25f6..56e62e5bab 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -36,6 +36,7 @@ from lightning_cloud.openapi import ( V1QueueServerType, V1SourceType, V1UserRequestedComputeConfig, + V1UserRequestedFlowComputeConfig, V1Work, ) from lightning_cloud.openapi.rest import ApiException @@ -206,6 +207,11 @@ class CloudRuntime(Runtime): flow_servers=frontend_specs, desired_state=V1LightningappInstanceState.RUNNING, env=v1_env_vars, + user_requested_flow_compute_config=V1UserRequestedFlowComputeConfig( + name=self.app.flow_cloud_compute.name, + shm_size=self.app.flow_cloud_compute.shm_size, + preemptible=False, + ), ) # if requirements file at the root of the repository is present, @@ -242,6 +248,7 @@ class CloudRuntime(Runtime): works=[V1Work(name=work_req.name, spec=work_req.spec) for work_req in work_reqs], local_source=True, dependency_cache_key=app_spec.dependency_cache_key, + user_requested_flow_compute_config=app_spec.user_requested_flow_compute_config, ) if ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER: diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index b99efd9483..7aa18b4b24 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -29,6 +29,7 @@ from lightning_cloud.openapi import ( V1QueueServerType, V1SourceType, V1UserRequestedComputeConfig, + V1UserRequestedFlowComputeConfig, V1Work, ) @@ -37,6 +38,7 @@ from lightning_app.runners import backends, cloud from lightning_app.storage import Drive, Mount from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.dependency_caching import get_hash +from lightning_app.utilities.packaging.cloud_compute import CloudCompute class MyWork(LightningWork): @@ -66,6 +68,47 @@ class WorkWithTwoDrives(LightningWork): class 
TestAppCreationClient: """Testing the calls made using GridRestClient to create the app.""" + @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) + def test_run_with_custom_flow_compute_config(self, monkeypatch): + mock_client = mock.MagicMock() + mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( + memberships=[V1Membership(name="test-project", project_id="test-project-id")] + ) + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=[]) + ) + cloud_backend = mock.MagicMock() + cloud_backend.client = mock_client + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) + app = mock.MagicMock() + app.flows = [] + app.frontend = {} + app.flow_cloud_compute = CloudCompute(name="t2.medium") + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py") + cloud_runtime._check_uploaded_folder = mock.MagicMock() + + monkeypatch.setattr(Path, "is_file", lambda *args, **kwargs: False) + monkeypatch.setattr(cloud, "Path", Path) + cloud_runtime.dispatch() + body = Body8( + app_entrypoint_file=mock.ANY, + enable_app_server=True, + flow_servers=[], + image_spec=None, + works=[], + local_source=True, + dependency_cache_key=mock.ANY, + user_requested_flow_compute_config=V1UserRequestedFlowComputeConfig( + name="t2.medium", + preemptible=False, + shm_size=0, + ), + ) + cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + project_id="test-project-id", app_id=mock.ANY, body=body + ) + @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) def test_run_on_byoc_cluster(self, monkeypatch): mock_client = mock.MagicMock() @@ -100,6 +143,7 @@ class TestAppCreationClient: works=[], local_source=True, dependency_cache_key=mock.ANY, + 
user_requested_flow_compute_config=mock.ANY, ) cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( project_id="default-project-id", app_id=mock.ANY, body=body @@ -142,6 +186,7 @@ class TestAppCreationClient: works=[], local_source=True, dependency_cache_key=mock.ANY, + user_requested_flow_compute_config=mock.ANY, ) cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( project_id="test-project-id", app_id=mock.ANY, body=body @@ -264,6 +309,7 @@ class TestAppCreationClient: enable_app_server=True, flow_servers=[], dependency_cache_key=get_hash(requirements_file), + user_requested_flow_compute_config=mock.ANY, image_spec=Gridv1ImageSpec( dependency_file_info=V1DependencyFileInfo( package_manager=V1PackageManager.PIP, path="requirements.txt" @@ -431,6 +477,7 @@ class TestAppCreationClient: enable_app_server=True, flow_servers=[], dependency_cache_key=get_hash(requirements_file), + user_requested_flow_compute_config=mock.ANY, image_spec=Gridv1ImageSpec( dependency_file_info=V1DependencyFileInfo( package_manager=V1PackageManager.PIP, path="requirements.txt" @@ -590,6 +637,7 @@ class TestAppCreationClient: enable_app_server=True, flow_servers=[], dependency_cache_key=get_hash(requirements_file), + user_requested_flow_compute_config=mock.ANY, image_spec=Gridv1ImageSpec( dependency_file_info=V1DependencyFileInfo( package_manager=V1PackageManager.PIP, path="requirements.txt" @@ -623,6 +671,7 @@ class TestAppCreationClient: enable_app_server=True, flow_servers=[], dependency_cache_key=get_hash(requirements_file), + user_requested_flow_compute_config=mock.ANY, image_spec=Gridv1ImageSpec( dependency_file_info=V1DependencyFileInfo( package_manager=V1PackageManager.PIP, path="requirements.txt" @@ -756,6 +805,7 @@ class TestAppCreationClient: package_manager=V1PackageManager.PIP, path="requirements.txt" ) ), + user_requested_flow_compute_config=mock.ANY, works=[ V1Work( 
name="test-work",