Mnodes (#5020)
* add a multi-nodes workflow Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
This commit is contained in:
parent
3b43e5a4fc
commit
e8206a9295
|
@ -0,0 +1,212 @@
|
||||||
|
name: Multi Nodes GPU Tests
|
||||||
|
|
||||||
|
# Workflow Steps:
|
||||||
|
# 1. Checkout Pytorch Lightning
|
||||||
|
# 2. Set up Python
|
||||||
|
# 3. Configure AWS Credentials
|
||||||
|
# 4. Install AWS Client
|
||||||
|
# 5. Get Current Sha Commit
|
||||||
|
# 6. Create Job Name
|
||||||
|
# 7. Update Test Configuration File
|
||||||
|
# 8. Install EKSClient
|
||||||
|
# 9. Create Gpu Node Pool
|
||||||
|
# 10. Check Current Node Pool | Current Elastic Pods
|
||||||
|
# 11. Apply Elastic
|
||||||
|
# 12. Wait 5 sec
|
||||||
|
# 13. Find ETCD TCP Address
|
||||||
|
# 14. Update Test Configuration File
|
||||||
|
# 15. Apply Multi Node Testing
|
||||||
|
# 16. Wait 120 secs
|
||||||
|
# 17. Listen to Jobs Logging
|
||||||
|
# 18. Statistics
|
||||||
|
# 19. Upload coverage results
|
||||||
|
# 20. Upload coverage to Codecov
|
||||||
|
# 21. Delete Group Node
|
||||||
|
|
||||||
|
#on: push
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
- release/*
|
||||||
|
pull_request:
|
||||||
|
types: [closed]
|
||||||
|
|
||||||
|
env:
|
||||||
|
AWS_CLUSTER: pl-lightning-torchelastic
|
||||||
|
NODE_TYPE: g4dn.xlarge
|
||||||
|
NODES: 2
|
||||||
|
NUM_GPUS: 1
|
||||||
|
REGION: us-east-2
|
||||||
|
MAX_CHECKS: 300
|
||||||
|
  CHECK_SLEEP: 2
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
multi-nodes-gpu-testing:
|
||||||
|
runs-on: ubuntu-20.04
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
python-version: [3.7]
|
||||||
|
pytorch-version: [1.5]
|
||||||
|
# Timeout: https://stackoverflow.com/a/59076067/4521646
|
||||||
|
timeout-minutes: 50
|
||||||
|
|
||||||
|
  # runs only when a merge has happened.
|
||||||
|
# if: github.event.pull_request.merged == true
|
||||||
|
steps:
|
||||||
|
|
||||||
|
- name: Checkout Pytorch Lightning
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
with:
|
||||||
|
repository: PyTorchLightning/pytorch-lightning
|
||||||
|
ref: ${{ github.event.base_ref }}
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v2
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
|
# Note: This uses an internal pip API and may not always work
|
||||||
|
# https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
|
||||||
|
- name: Cache pip
|
||||||
|
uses: actions/cache@v2
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pip
|
||||||
|
key: ${{ runner.os }}-pip-multi-node
|
||||||
|
restore-keys: |
|
||||||
|
${{ runner.os }}-pip-
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install awscli coverage
|
||||||
|
# todo
|
||||||
|
pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.3 -q --no-cache-dir
|
||||||
|
#pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/lightning-dtrun.git@mnodes -q --no-cache-dir
|
||||||
|
|
||||||
|
- name: Configure AWS Credentials
|
||||||
|
uses: aws-actions/configure-aws-credentials@v1
|
||||||
|
with:
|
||||||
|
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||||
|
aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
|
||||||
|
aws-region: us-east-2
|
||||||
|
|
||||||
|
- name: Get Current Sha Commit
|
||||||
|
id: vars
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
echo "::set-output name=SHA::$(git rev-parse --short HEAD)"
|
||||||
|
echo $PWD
|
||||||
|
|
||||||
|
- name: Create Job Name
|
||||||
|
id: job
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )"
|
||||||
|
echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )"
|
||||||
|
|
||||||
|
- name: Install EKSClient
|
||||||
|
run: |
|
||||||
|
curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
|
||||||
|
sudo mv /tmp/eksctl /usr/local/bin
|
||||||
|
shell: bash
|
||||||
|
|
||||||
|
- name: Create Gpu Node Pool
|
||||||
|
run: |
|
||||||
|
aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER
|
||||||
|
eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES
|
||||||
|
# eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES
|
||||||
|
shell: bash
|
||||||
|
|
||||||
|
      - name: Check Current Node Pool | Current Elastic Pods
|
||||||
|
run: |
|
||||||
|
eksctl get nodegroups --cluster $AWS_CLUSTER
|
||||||
|
kubectl get pods -n elastic-job
|
||||||
|
|
||||||
|
- name: Apply Elastic
|
||||||
|
run: |
|
||||||
|
git clone https://github.com/pytorch/elastic.git
|
||||||
|
cd elastic/kubernetes
|
||||||
|
|
||||||
|
kubectl apply -k config/default
|
||||||
|
|
||||||
|
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml
|
||||||
|
kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml
|
||||||
|
|
||||||
|
- name: Wait
|
||||||
|
# todo: this shall be dynamic
|
||||||
|
if: always()
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
sleep 5
|
||||||
|
|
||||||
|
- name: Find ETCD TCP Address
|
||||||
|
id: tcp
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' | head -1)"
|
||||||
|
|
||||||
|
- name: Update Test Config. File
|
||||||
|
run: |
|
||||||
|
import os
|
||||||
|
from dtrun.configs import prepare_multi_nodes_gpu_config
|
||||||
|
|
||||||
|
assert os.path.isfile('./tests/mnode_tests.txt')
|
||||||
|
prepare_multi_nodes_gpu_config(
|
||||||
|
'./.github/multi-nodes-gpu.yaml',
|
||||||
|
'./tests/mnode_tests.txt',
|
||||||
|
sha="${{ steps.vars.outputs.SHA }}",
|
||||||
|
tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",
|
||||||
|
python_version="${{ matrix.python-version }}",
|
||||||
|
torch_version="${{ matrix.pytorch-version }}",
|
||||||
|
num_gpus=1,
|
||||||
|
)
|
||||||
|
shell: python
|
||||||
|
|
||||||
|
- name: Apply Multi Node Testing
|
||||||
|
run: |
|
||||||
|
# cat ./.github/multi-nodes-gpu.yaml
|
||||||
|
kubectl apply -f ./.github/multi-nodes-gpu.yaml
|
||||||
|
shell: bash
|
||||||
|
|
||||||
|
- name: Wait
|
||||||
|
# todo: this shall be dynamic
|
||||||
|
if: always()
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
sleep 400
|
||||||
|
|
||||||
|
- name: Listen to Jobs Logging
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
# todo: Enable automatic checking.
|
||||||
|
          # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SLEEP; done && \
|
||||||
|
# echo "Done waiting. Job status code: $status_code" && \
|
||||||
|
kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
|
||||||
|
if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
|
||||||
|
cat xx00
|
||||||
|
|
||||||
|
- name: Statistics
|
||||||
|
if: success()
|
||||||
|
run: |
|
||||||
|
cat ./xx01 | tail -n +2 | base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage
|
||||||
|
cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml
|
||||||
|
|
||||||
|
- name: Upload coverage to Codecov
|
||||||
|
uses: codecov/codecov-action@v1
|
||||||
|
if: always()
|
||||||
|
# see: https://github.com/actions/toolkit/issues/399
|
||||||
|
continue-on-error: true
|
||||||
|
with:
|
||||||
|
token: ${{ secrets.CODECOV_TOKEN }}
|
||||||
|
file: coverage.xml
|
||||||
|
flags: multi-nodes,pytest
|
||||||
|
name: multi-nodes-coverage
|
||||||
|
fail_ci_if_error: false
|
||||||
|
|
||||||
|
- name: Delete Group Node
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job
|
||||||
|
eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER
|
|
@ -0,0 +1,126 @@
|
||||||
|
# Copyright The PyTorch Lightning team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
|
||||||
|
sys.path.insert(0, ROOT)
|
||||||
|
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
|
||||||
|
from pytorch_lightning import LightningModule # noqa: E402
|
||||||
|
from pytorch_lightning import Trainer
|
||||||
|
from tests.base.boring_model import BoringModel # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest")
def test_logging_sync_dist_true_ddp(tmpdir):
    """
    Ensure that metrics logged with ``on_epoch=True`` in ``training_step`` and
    ``validation_step`` survive a multi-node DDP run unchanged.

    The model logs the constant ``fake_result`` on every step; after training we
    assert that the aggregated (epoch-level) logged value is still that constant.
    Runs with ``accelerator="ddp"``, 1 GPU per node, 2 nodes — despite the
    original docstring, this is NOT a CPU test.

    NOTE(review): launched via the multi-node workflow with
    PL_RUNNING_SPECIAL_TESTS=1, not through a plain pytest session.
    """
    # Constant value logged on every step; epoch aggregation of a constant
    # should return the same constant regardless of world size.
    fake_result = 1

    class TestModel(BoringModel):

        def training_step(self, batch, batch_idx):
            acc = self.step(batch[0])
            # on_step=False / on_epoch=True: only the epoch-aggregated 'foo' is logged.
            self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True)
            return acc

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True)
            return {"x": loss}

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=2,
        weights_summary=None,
        accelerator="ddp",
        gpus=1,
        num_nodes=2,
    )
    trainer.fit(model)

    # Epoch aggregation of a constant must yield the constant on every rank.
    assert trainer.logged_metrics['foo'] == fake_result
    assert trainer.logged_metrics['bar'] == fake_result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest")
def test__validation_step__log(tmpdir):
    """
    Test that ``validation_step`` can log, and that the logged / callback metric
    key sets are exactly as expected in a 2-node DDP run.

    Verifies:
      * training metrics logged with ``on_step=True, on_epoch=True`` produce
        both ``*_step`` and ``*_epoch`` keys;
      * validation step metrics are exposed per-epoch in ``logged_metrics``
        (``b_step/epoch_N``) but NOT as step keys in ``callback_metrics``.
    """
    # Enable Lightning's dev-debug mode so internal logging bookkeeping is recorded.
    os.environ['PL_DEV_DEBUG'] = '1'

    class TestModel(BoringModel):

        def training_step(self, batch, batch_idx):
            acc = self.step(batch)
            # Make the value batch-dependent so per-step logging is distinguishable.
            acc = acc + batch_idx
            self.log('a', acc, on_step=True, on_epoch=True)
            self.log('a2', 2)  # default flags: logged as-is, no _step/_epoch split

            self.training_step_called = True
            return acc

        def validation_step(self, batch, batch_idx):
            acc = self.step(batch)
            acc = acc + batch_idx
            self.log('b', acc, on_step=True, on_epoch=True)
            # NOTE(review): sets training_step_called (not validation_step_called) —
            # looks like a copy-paste slip in the original; preserved as-is.
            self.training_step_called = True

        def backward(self, loss, optimizer, optimizer_idx):
            # Explicitly delegate to the base implementation (exercises the override path).
            return LightningModule.backward(self, loss, optimizer, optimizer_idx)

    model = TestModel()
    # Disable the validation aggregation hooks so only validation_step logging is tested.
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=2,
        max_epochs=2,
        log_every_n_steps=1,
        weights_summary=None,
        accelerator="ddp",
        gpus=1,
        num_nodes=2,
    )
    trainer.fit(model)

    # make sure all the metrics are available for callbacks
    # NOTE(review): the 'b_step/epoch_N' key format is PL-version-specific —
    # confirm against the pinned pytorch-lightning version if this set drifts.
    expected_logged_metrics = {
        'a2',
        'a_step',
        'a_epoch',
        'b_step/epoch_0',
        'b_step/epoch_1',
        'b_epoch',
        'epoch',
    }
    logged_metrics = set(trainer.logged_metrics.keys())
    assert expected_logged_metrics == logged_metrics

    # we don't want to enable val metrics during steps because it is not something that users should do
    # on purpose DO NOT allow step_b... it's silly to monitor val step metrics
    callback_metrics = set(trainer.callback_metrics.keys())
    # 'debug_epoch' is injected by PL_DEV_DEBUG mode; not part of the contract under test.
    callback_metrics.remove('debug_epoch')
    expected_cb_metrics = {'a', 'a2', 'b', 'a_epoch', 'b_epoch', 'a_step'}
    assert expected_cb_metrics == callback_metrics
|
|
@ -0,0 +1,2 @@
|
||||||
|
./tests/backends/test_multi_nodes_gpu.py::test_logging_sync_dist_true_ddp
|
||||||
|
./tests/backends/test_multi_nodes_gpu.py::test__validation_step__log
|
Loading…
Reference in New Issue