diff --git a/.github/workflows/ci_test-mnodes.yml b/.github/workflows/ci_test-mnodes.yml
new file mode 100644
index 0000000000..259ce9642e
--- /dev/null
+++ b/.github/workflows/ci_test-mnodes.yml
@@ -0,0 +1,212 @@
+name: Multi Nodes GPU Tests
+
+# Workflow Steps:
+# 1. Checkout PyTorch Lightning
+# 2. Set up Python
+# 3. Cache pip
+# 4. Install dependencies (AWS CLI, coverage, lightning-dtrun)
+# 5. Configure AWS Credentials
+# 6. Get Current Sha Commit
+# 7. Create Job Name
+# 8. Install EKSClient
+# 9. Create Gpu Node Pool
+# 10. Check Current Node Pool | Current Elastic Pods
+# 11. Apply Elastic
+# 12. Wait 5 sec
+# 13. Find ETCD TCP Address
+# 14. Update Test Config. File
+# 15. Apply Multi Node Testing
+# 16. Wait 400 secs
+# 17. Listen to Jobs Logging
+# 18. Statistics
+# 19. Upload coverage to Codecov
+# 20. Delete Group Node
+
+#on: push
+
+on:
+  push:
+    branches:
+      - master
+      - release/*
+  pull_request:
+    types: [closed]
+
+env:
+  AWS_CLUSTER: pl-lightning-torchelastic
+  NODE_TYPE: g4dn.xlarge
+  NODES: 2
+  NUM_GPUS: 1
+  REGION: us-east-2
+  MAX_CHECKS: 300
+  CHECK_SLEEP: 2
+
+jobs:
+  multi-nodes-gpu-testing:
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.7]
+        pytorch-version: [1.5]
+    # Timeout: https://stackoverflow.com/a/59076067/4521646
+    timeout-minutes: 50
+
+    # runs only when the PR has been merged.
+    # if: github.event.pull_request.merged == true
+    steps:
+
+      - name: Checkout PyTorch Lightning
+        uses: actions/checkout@v2
+        with:
+          repository: PyTorchLightning/pytorch-lightning
+          ref: ${{ github.event.base_ref }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # Note: This uses an internal pip API and may not always work
+      # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
+      - name: Cache pip
+        uses: actions/cache@v2
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-multi-node
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          pip install awscli coverage
+          # todo
+          pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.3 -q --no-cache-dir
+          #pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/lightning-dtrun.git@mnodes -q --no-cache-dir
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
+          aws-region: us-east-2
+
+      - name: Get Current Sha Commit
+        id: vars
+        shell: bash
+        run: |
+          echo "::set-output name=SHA::$(git rev-parse --short HEAD)"
+          echo $PWD
+
+      - name: Create Job Name
+        id: job
+        shell: bash
+        run: |
+          echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )"
+          echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )"
- )" + + - name: Install EKSClient + run: | + curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp + sudo mv /tmp/eksctl /usr/local/bin + shell: bash + + - name: Create Gpu Node Pool + run: | + aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER + eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES + # eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES + shell: bash + + - name: Check Current Node Pool | Current Elatic Pods + run: | + eksctl get nodegroups --cluster $AWS_CLUSTER + kubectl get pods -n elastic-job + + - name: Apply Elastic + run: | + git clone https://github.com/pytorch/elastic.git + cd elastic/kubernetes + + kubectl apply -k config/default + + kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml + kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml + + - name: Wait + # todo: this shall be dynamic + if: always() + shell: bash + run: | + sleep 5 + + - name: Find ETCD TCP Address + id: tcp + shell: bash + run: | + echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' | head -1)" + + - name: Update Test Config. File + run: | + import os + from dtrun.configs import prepare_multi_nodes_gpu_config + + assert os.path.isfile('./tests/mnode_tests.txt') + prepare_multi_nodes_gpu_config( + './.github/multi-nodes-gpu.yaml', + './tests/mnode_tests.txt', + sha="${{ steps.vars.outputs.SHA }}", + tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}", + python_version="${{ matrix.python-version }}", + torch_version="${{ matrix.pytorch-version }}", + num_gpus=1, + ) + shell: python + + - name: Apply Multi Node Testing + run: | + # cat ./.github/multi-nodes-gpu.yaml + kubectl apply -f ./.github/multi-nodes-gpu.yaml + shell: bash + + - name: Wait + # todo: this shall be dynamic + if: always() + shell: bash + run: | + sleep 400 + + - name: Listen to Jobs Logging + shell: bash + run: | + # todo: Enable automatic checking. + # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \ + # echo "Done waiting. 
+          kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
+          if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
+          cat xx00
+
+      - name: Statistics
+        if: success()
+        run: |
+          cat ./xx01 | tail -n +2 | base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage
+          cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v1
+        if: always()
+        # see: https://github.com/actions/toolkit/issues/399
+        continue-on-error: true
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: coverage.xml
+          flags: multi-nodes,pytest
+          name: multi-nodes-coverage
+          fail_ci_if_error: false
+
+      - name: Delete Group Node
+        if: always()
+        run: |
+          kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job
+          eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER
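Note on the "Update Test Config. File" step: it relies on prepare_multi_nodes_gpu_config from the private lightning-dtrun package, so its behaviour is not visible in this diff. The sketch below is only a rough orientation for reviewers, showing the kind of placeholder substitution such a helper presumably performs on the .github/multi-nodes-gpu.yaml job template; the function name, placeholder tokens and template keys are hypothetical, not the real dtrun API.

# Hypothetical sketch only -- NOT the lightning-dtrun implementation.
# It assumes the Kubernetes job template uses plain string placeholders such as {SHA};
# the real helper and template keys live in the private lightning-dtrun package.
from pathlib import Path


def render_multi_node_job(template_path, sha, tcp_address, python_version, torch_version, num_gpus):
    """Fill the multi-node job manifest in place with the values gathered by the workflow."""
    text = Path(template_path).read_text()
    replacements = {
        "{SHA}": sha,
        "{TCP_ADDRESS}": tcp_address,
        "{PYTHON_VERSION}": python_version,
        "{TORCH_VERSION}": torch_version,
        "{NUM_GPUS}": str(num_gpus),
    }
    for token, value in replacements.items():
        # simple textual substitution; the rendered manifest is then applied with kubectl
        text = text.replace(token, value)
    Path(template_path).write_text(text)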
diff --git a/tests/accelerators/legacy/test_multi_nodes_gpu.py b/tests/accelerators/legacy/test_multi_nodes_gpu.py
new file mode 100644
index 0000000000..f17ac42fcb
--- /dev/null
+++ b/tests/accelerators/legacy/test_multi_nodes_gpu.py
@@ -0,0 +1,126 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+
+import pytest
+import torch
+
+ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
+sys.path.insert(0, ROOT)
+DIR_PATH = os.path.dirname(os.path.realpath(__file__))
+
+from pytorch_lightning import LightningModule  # noqa: E402
+from pytorch_lightning import Trainer  # noqa: E402
+from tests.base.boring_model import BoringModel  # noqa: E402
+
+
+@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest")
+def test_logging_sync_dist_true_ddp(tmpdir):
+    """
+    Tests that logging in a multi-node DDP run returns the original value (each process logs the same value)
+    """
+    fake_result = 1
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            acc = self.step(batch[0])
+            self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True)
+            return acc
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True)
+            return {"x": loss}
+
+    model = TestModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        max_epochs=2,
+        weights_summary=None,
+        accelerator="ddp",
+        gpus=1,
+        num_nodes=2,
+    )
+    trainer.fit(model)
+
+    assert trainer.logged_metrics['foo'] == fake_result
+    assert trainer.logged_metrics['bar'] == fake_result
+
+
+@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest")
+def test__validation_step__log(tmpdir):
+    """
+    Tests that validation_step can log
+    """
+    os.environ['PL_DEV_DEBUG'] = '1'
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            acc = self.step(batch)
+            acc = acc + batch_idx
+            self.log('a', acc, on_step=True, on_epoch=True)
+            self.log('a2', 2)
+
+            self.training_step_called = True
+            return acc
+
+        def validation_step(self, batch, batch_idx):
+            acc = self.step(batch)
+            acc = acc + batch_idx
+            self.log('b', acc, on_step=True, on_epoch=True)
+            self.validation_step_called = True
+
+        def backward(self, loss, optimizer, optimizer_idx):
+            return LightningModule.backward(self, loss, optimizer, optimizer_idx)
+
+    model = TestModel()
+    model.validation_step_end = None
+    model.validation_epoch_end = None
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=2,
+        limit_val_batches=2,
+        max_epochs=2,
+        log_every_n_steps=1,
+        weights_summary=None,
+        accelerator="ddp",
+        gpus=1,
+        num_nodes=2,
+    )
+    trainer.fit(model)
+
+    # make sure all the metrics are available for callbacks
+    expected_logged_metrics = {
+        'a2',
+        'a_step',
+        'a_epoch',
+        'b_step/epoch_0',
+        'b_step/epoch_1',
+        'b_epoch',
+        'epoch',
+    }
+    logged_metrics = set(trainer.logged_metrics.keys())
+    assert expected_logged_metrics == logged_metrics
+
+    # we don't want to enable val metrics during steps because it is not something users should do
+    # on purpose DO NOT allow step_b... there is no point in monitoring val step metrics
+    callback_metrics = set(trainer.callback_metrics.keys())
+    callback_metrics.remove('debug_epoch')
+    expected_cb_metrics = {'a', 'a2', 'b', 'a_epoch', 'b_epoch', 'a_step'}
+    assert expected_cb_metrics == callback_metrics
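For orientation, the arithmetic behind the first test's assertions: with NODES=2 and NUM_GPUS=1 there are two processes, each logging the same fake_result, so reducing identical values across ranks leaves the value unchanged. The snippet below only illustrates that expectation; the world size and the mean reduction are assumptions for illustration, not part of the test itself.

# Illustration only: identical values mean-reduced across ranks stay unchanged.
world_size = 2              # NODES * NUM_GPUS from the workflow env (assumption)
fake_result = 1
per_rank_values = [fake_result] * world_size
reduced = sum(per_rank_values) / world_size
assert reduced == fake_result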
diff --git a/tests/mnode_tests.txt b/tests/mnode_tests.txt
new file mode 100644
index 0000000000..77a3ed58db
--- /dev/null
+++ b/tests/mnode_tests.txt
@@ -0,0 +1,2 @@
+./tests/accelerators/legacy/test_multi_nodes_gpu.py::test_logging_sync_dist_true_ddp
+./tests/accelerators/legacy/test_multi_nodes_gpu.py::test__validation_step__log
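tests/mnode_tests.txt is a plain list of pytest node ids, one per line. In this PR the entries are consumed by the torchelastic job prepared through lightning-dtrun, which is not shown here; the stand-alone sketch below only illustrates the intended contract of the file, namely running each listed test with the PL_RUNNING_SPECIAL_TESTS guard enabled. The runner itself is hypothetical, not part of the PR.

# Hypothetical local runner -- the real execution happens inside the torchelastic job.
import os
import subprocess

with open("tests/mnode_tests.txt") as fh:
    test_ids = [line.strip() for line in fh if line.strip()]

env = dict(os.environ, PL_RUNNING_SPECIAL_TESTS="1")
for test_id in test_ids:
    # each entry is a regular pytest node id: path/to/test_file.py::test_name
    subprocess.run(["python", "-m", "pytest", test_id, "-v"], env=env, check=True)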