From abae4c903bfc50c659f4a70ba66e2235cee8d6cd Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 28 Feb 2024 14:35:53 +0100 Subject: [PATCH] Update Lightning AI multi-node guide (Trainer) (#19530) * update * update * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * configure_model --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source-pytorch/clouds/cluster.rst | 22 +- docs/source-pytorch/clouds/lightning_ai.rst | 192 ++++++++++++++++++ docs/source-pytorch/common/index.rst | 4 +- docs/source-pytorch/common_usecases.rst | 4 +- docs/source-pytorch/levels/intermediate.rst | 4 +- .../levels/intermediate_level_14.rst | 23 ++- 6 files changed, 227 insertions(+), 22 deletions(-) create mode 100644 docs/source-pytorch/clouds/lightning_ai.rst diff --git a/docs/source-pytorch/clouds/cluster.rst b/docs/source-pytorch/clouds/cluster.rst index 59a252a43b..f4775ceaaa 100644 --- a/docs/source-pytorch/clouds/cluster.rst +++ b/docs/source-pytorch/clouds/cluster.rst @@ -1,6 +1,6 @@ -######################### -Run on an on-prem cluster -######################### +########################### +Run on a multi-node cluster +########################### .. raw:: html @@ -8,14 +8,20 @@ Run on an on-prem cluster
-.. Add callout items below this line +.. displayitem:: + :header: Run single or multi-node on Lightning Studios + :description: The easiest way to scale models in the cloud. No infrastructure setup required. + :col_css: col-md-6 + :button_link: lightning_ai.html + :height: 160 + :tag: basic .. displayitem:: :header: Run on an on-prem cluster :description: Learn to train models on a general compute cluster. :col_css: col-md-6 :button_link: cluster_intermediate_1.html - :height: 150 + :height: 160 :tag: intermediate .. displayitem:: @@ -23,7 +29,7 @@ Run on an on-prem cluster :description: Run models on a cluster with torch distributed. :col_css: col-md-6 :button_link: cluster_intermediate_2.html - :height: 150 + :height: 160 :tag: intermediate .. displayitem:: @@ -31,7 +37,7 @@ Run on an on-prem cluster :description: Run models on a SLURM-managed cluster :col_css: col-md-6 :button_link: cluster_advanced.html - :height: 150 + :height: 160 :tag: intermediate .. displayitem:: @@ -39,7 +45,7 @@ Run on an on-prem cluster :description: Learn how to integrate your own cluster :col_css: col-md-6 :button_link: cluster_expert.html - :height: 150 + :height: 160 :tag: expert .. raw:: html diff --git a/docs/source-pytorch/clouds/lightning_ai.rst b/docs/source-pytorch/clouds/lightning_ai.rst new file mode 100644 index 0000000000..4137d1ae95 --- /dev/null +++ b/docs/source-pytorch/clouds/lightning_ai.rst @@ -0,0 +1,192 @@ +:orphan: + +############################################# +Run single or multi-node on Lightning Studios +############################################# + +**Audience**: Users who don't want to waste time on cluster configuration and maintenance. + +`Lightning Studios <https://lightning.ai>`_ is a cloud platform where you can build, train, finetune and deploy models without worrying about infrastructure, cost management, scaling, and other technical headaches. 
+This guide shows you how easy it is to run a PyTorch Lightning training script across multiple machines on Lightning Studios. + + +---- + + +************* +Initial Setup +************* + +First, create a free `Lightning AI account <https://lightning.ai/>`_. +You get free credits every month you can spend on GPU compute. +To use machines with multiple GPUs or run jobs across machines, you need to be on the `Pro or Teams plan <https://lightning.ai/pricing>`_. + + +---- + + +*************************************** +Launch multi-node training in the cloud +*************************************** + +**Step 1:** Start a new Studio. + +.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/fabric/videos/start-studio-for-mmt.mp4 + :width: 800 + :loop: + :muted: + +| + +**Step 2:** Bring your code into the Studio. You can clone a GitHub repo, drag and drop local files, or use the following demo example: + +.. collapse:: Code Example + + .. code-block:: python + + import lightning as L + import torch + import torch.nn.functional as F + from lightning.pytorch.demos import Transformer, WikiText2 + from torch.utils.data import DataLoader, random_split + + + class LanguageDataModule(L.LightningDataModule): + def __init__(self, batch_size): + super().__init__() + self.batch_size = batch_size + self.vocab_size = 33278 + + def prepare_data(self): + WikiText2(download=True) + + def setup(self, stage): + dataset = WikiText2() + + # Split data into train, val, test + n = len(dataset) + self.train_dataset, self.val_dataset, self.test_dataset = random_split(dataset, [n - 4000, 2000, 2000]) + + def train_dataloader(self): + return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False) + + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False) + + + class LanguageModel(L.LightningModule): + def __init__(self, vocab_size): + super().__init__() + 
self.vocab_size = vocab_size + self.model = None + + def configure_model(self): + if self.model is None: + self.model = Transformer(vocab_size=self.vocab_size) + + def training_step(self, batch, batch_idx): + input, target = batch + output = self.model(input, target) + loss = F.nll_loss(output, target.view(-1)) + self.log("train_loss", loss) + return loss + + def validation_step(self, batch, batch_idx): + input, target = batch + output = self.model(input, target) + loss = F.nll_loss(output, target.view(-1)) + self.log("val_loss", loss) + return loss + + def test_step(self, batch, batch_idx): + input, target = batch + output = self.model(input, target) + loss = F.nll_loss(output, target.view(-1)) + self.log("test_loss", loss) + return loss + + def configure_optimizers(self): + return torch.optim.SGD(self.parameters(), lr=0.1) + + + def main(): + L.seed_everything(42) + + datamodule = LanguageDataModule(batch_size=20) + model = LanguageModel(datamodule.vocab_size) + + # Trainer + trainer = L.Trainer(gradient_clip_val=0.25, max_epochs=2, strategy="ddp") + trainer.fit(model, datamodule=datamodule) + trainer.test(model, datamodule=datamodule) + + + if __name__ == "__main__": + main() + +| + +**Step 3:** Remove hardcoded accelerator settings if any and let Lightning automatically set them for you. No other changes are required in your script. + +.. code-block:: python + + # These are the defaults + trainer = L.Trainer(accelerator="auto", devices="auto") + + # DON'T hardcode these, leave them default/auto + # trainer = L.Trainer(accelerator="cpu", devices=3) + +| + +**Step 4:** Install dependencies and download all necessary data. Test that your script runs in the Studio first. If it runs in the Studio, it will run in multi-node! + +| + +**Step 5:** Open the Multi-Machine Training (MMT) app. Type the command to run your script, select the machine type and how many machines you want to launch it on. Click "Run" to start the job. + +.. 
video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/lightning-ai-mmt-demo-pl.mp4 + :width: 800 + :loop: + :muted: + +After submitting the job, you will be redirected to a page where you can monitor the machine metrics and logs in real time. + + +---- + + +**************************** +Bring your own cloud account +**************************** + +As a `Teams or Enterprise <https://lightning.ai/pricing>`_ customer, you have the option to connect your existing cloud account to Lightning AI. +This gives your organization the ability to keep all compute and data on your own cloud account and your Virtual Private Cloud (VPC). + + +---- + +********** +Learn more +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Lightning Studios + :description: Code together. Prototype. Train. Deploy. Host AI web apps. From your browser - with zero setup. + :col_css: col-md-4 + :button_link: https://lightning.ai + :height: 150 + +.. raw:: html + +
+
+ +| diff --git a/docs/source-pytorch/common/index.rst b/docs/source-pytorch/common/index.rst index 84d4e331cf..17bab965be 100644 --- a/docs/source-pytorch/common/index.rst +++ b/docs/source-pytorch/common/index.rst @@ -112,8 +112,8 @@ How-to Guides :height: 180 .. displayitem:: - :header: Run on an on-prem cluster - :description: Learn to run on your own cluster + :header: Run on a multi-node cluster + :description: Learn to run multi-node in the cloud or on your cluster :button_link: ../clouds/cluster.html :col_css: col-md-4 :height: 180 diff --git a/docs/source-pytorch/common_usecases.rst b/docs/source-pytorch/common_usecases.rst index 7e6ed91d0c..0b9447a414 100644 --- a/docs/source-pytorch/common_usecases.rst +++ b/docs/source-pytorch/common_usecases.rst @@ -85,8 +85,8 @@ Customize and extend Lightning for things like custom hardware or distributed st :height: 100 .. displayitem:: - :header: Run on an on-prem cluster - :description: Learn to run on your own cluster + :header: Run on a multi-node cluster + :description: Learn to run multi-node in the cloud or on your cluster :col_css: col-md-12 :button_link: clouds/cluster.html :height: 100 diff --git a/docs/source-pytorch/levels/intermediate.rst b/docs/source-pytorch/levels/intermediate.rst index f7beb29788..b0eaf3352b 100644 --- a/docs/source-pytorch/levels/intermediate.rst +++ b/docs/source-pytorch/levels/intermediate.rst @@ -64,8 +64,8 @@ Learn to scale up your models and enable collaborative model development at acad :tag: intermediate .. displayitem:: - :header: Level 13: Run on on-prem clusters - :description: Run on a custom on-prem cluster or SLURM cluster. 
+ :header: Level 13: Run on a multi-node cluster + :description: Learn to run multi-node in the cloud or on your cluster :col_css: col-md-6 :button_link: intermediate_level_14.html :height: 150 diff --git a/docs/source-pytorch/levels/intermediate_level_14.rst b/docs/source-pytorch/levels/intermediate_level_14.rst index b73cce233f..3ced3259b1 100644 --- a/docs/source-pytorch/levels/intermediate_level_14.rst +++ b/docs/source-pytorch/levels/intermediate_level_14.rst @@ -1,10 +1,10 @@ :orphan: -################################# -Level 13: Run on on-prem clusters -################################# +##################################### +Level 13: Run on a multi-node cluster +##################################### -In this level you'll learn to run on on-prem clusters. +In this level you'll learn to run on cloud or on-prem clusters. ---- @@ -13,14 +13,21 @@ In this level you'll learn to run on on-prem clusters.
-.. Add callout items below this line + +.. displayitem:: + :header: Run single or multi-node on Lightning Studios + :description: The easiest way to scale models in the cloud. No infrastructure setup required. + :col_css: col-md-4 + :button_link: ../clouds/lightning_ai.html + :height: 160 + :tag: basic .. displayitem:: :header: Run on an on-prem cluster :description: Learn to train models on a general compute cluster. :col_css: col-md-4 :button_link: ../clouds/cluster_intermediate_1.html - :height: 150 + :height: 160 :tag: intermediate .. displayitem:: @@ -28,7 +35,7 @@ In this level you'll learn to run on on-prem clusters. :description: Run models on a SLURM-managed cluster :col_css: col-md-4 :button_link: ../clouds/cluster_advanced.html - :height: 150 + :height: 160 :tag: intermediate .. displayitem:: @@ -36,7 +43,7 @@ In this level you'll learn to run on on-prem clusters. :description: Run models on a cluster with torch distributed. :col_css: col-md-4 :button_link: ../clouds/cluster_intermediate_2.html - :height: 150 + :height: 160 :tag: intermediate .. raw:: html