lightning/.github/workflows/ci_test-mnodes.yml

name: Multi Nodes GPU Tests
# Workflow Steps:
# 1. Checkout Pytorch Lightning
# 2. Set up Python
# 3. Configure AWS Credentials
# 4. Install AWS Client
# 5. Get Current Sha Commit
# 6. Create Job Name
# 7. Update Test Configuration File
# 8. Install EKSClient
# 9. Create Gpu Node Pool
# 10. Check Current Node Pool | Current Elastic Pods
# 11. Apply Elastic
# 12. Wait 5 sec
# 13. Find ETCD TCP Address
# 14. Update Test Configuration File
# 15. Apply Multi Node Testing
# 16. Wait 400 secs
# 17. Listen to Jobs Logging
# 18. Statistics
# 19. Upload coverage to Codecov
# 20. Delete Group Node
on:
  push:
    branches:
      - never-ever-run-
  #pull_request:
  #  types: [closed]
env:
  AWS_CLUSTER: pl-lightning-torchelastic
  NODE_TYPE: g4dn.xlarge
  NODES: 2
  NUM_GPUS: 1
  REGION: us-east-2
  # MAX_CHECKS / CHECK_SLEEP drive the (currently disabled) polling loop
  # in the "Listen to Jobs Logging" step.
  MAX_CHECKS: 300
  CHECK_SLEEP: 2
jobs:
  multi-nodes-gpu-testing:
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        python-version: [3.7]
        pytorch-version: [1.6]
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50

    # runs only when the merge happened.
    # if: github.event.pull_request.merged == true
    steps:
      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          ref: ${{ github.event.base_ref }}

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Weekly reset caching
        run: echo "::set-output name=period::$(python -c 'import time ; days = time.time() / 60 / 60 / 24 ; print(int(days / 7))' 2>&1)"
        id: times
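      # The "period" output above is the number of whole weeks since the Unix epoch
      # (roughly 2700 in 2021; illustrative value), so the cache keys below rotate weekly.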
      # Note: This uses an internal pip API and may not always work
      # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
      - name: Cache pip
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-multi-node
          restore-keys: |
            ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-

      - name: Install dependencies
        run: |
          pip install awscli coverage
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
          aws-region: us-east-2

      - name: Get Current Sha Commit
        id: vars
        shell: bash
        run: |
          echo "::set-output name=SHA::$(git rev-parse --short HEAD)"
          echo $PWD
      - name: Create Job Name
        id: job
        shell: bash
        run: |
          echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )"
          echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )"
      - name: Install EKSClient
        run: |
          curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
        shell: bash

      - name: Create Gpu Node Pool
        run: |
          aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER
          eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES
          # eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES
        shell: bash
      - name: Check Current Node Pool | Current Elastic Pods
        run: |
          eksctl get nodegroups --cluster $AWS_CLUSTER
          kubectl get pods -n elastic-job

      - name: Apply Elastic
        run: |
          git clone https://github.com/pytorch/elastic.git
          cd elastic/kubernetes
          kubectl apply -k config/default
          kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml
          kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml
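      # What the three manifests above provide (summary, not part of the original file):
      #   config/default       -> the TorchElastic operator and the ElasticJob CRD
      #                           (the job kind deleted in the cleanup step below)
      #   nvidia-device-plugin -> exposes the nodes' GPUs to the Kubernetes scheduler
      #   etcd.yaml            -> a single etcd pod used as the rendezvous backend;
      #                           its address is scraped in "Find ETCD TCP Address"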
      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 5
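      # Possible dynamic replacement for the fixed sleep above (untested sketch,
      # not part of the original workflow): block until the etcd pod is Ready.
      #   kubectl wait --for=condition=Ready pod/etcd -n elastic-job --timeout=120s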
      - name: Find ETCD TCP Address
        id: tcp
        shell: bash
        run: |
          echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' | head -1)"
      - name: Update Test Config. File
        run: |
          import os
          from dtrun.configs import prepare_multi_nodes_gpu_config

          assert os.path.isfile('./tests/mnode_tests.txt')
          prepare_multi_nodes_gpu_config(
              './.github/multi-nodes-gpu.yaml',
              './tests/mnode_tests.txt',
              sha="${{ steps.vars.outputs.SHA }}",
              tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",
              python_version="${{ matrix.python-version }}",
              torch_version="${{ matrix.pytorch-version }}",
              num_gpus=1,
          )
        shell: python
      - name: Apply Multi Node Testing
        run: |
          # cat ./.github/multi-nodes-gpu.yaml
          kubectl apply -f ./.github/multi-nodes-gpu.yaml
        shell: bash

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 400
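      # Possible dynamic replacement (untested sketch; assumes the ElasticJob names
      # its first worker "<ID_NAME>-worker-0", matching the log commands below):
      #   kubectl wait --for=condition=Ready \
      #     pod/${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job --timeout=600s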
      - name: Listen to Jobs Logging
        shell: bash
        run: |
          # todo: Enable automatic checking.
          # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SLEEP; done && \
          # echo "Done waiting. Job status code: $status_code" && \
          kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
          # Split the log at END_TOKEN: xx00 holds the test output, xx01 the
          # base64-encoded coverage payload appended after the token.
          if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
          cat xx00
      - name: Statistics
        if: success()
        run: |
          # xx01 starts at the END_TOKEN line; drop that line and decode the
          # remaining base64 payload back into a coverage data file.
          cat ./xx01 | tail -n +2 | base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage
          cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        if: always()
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: multi-nodes,pytest
          name: multi-nodes-coverage
          fail_ci_if_error: false
      - name: Delete Group Node
        if: always()
        run: |
          kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job
          eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER