lightning/.github/workflows/ci_test-mnodes.yml

name: Multi Nodes GPU Tests

# Workflow Steps:
#  1. Checkout Pytorch Lightning
#  2. Set up Python
#  3. Configure AWS Credentials
#  4. Install AWS Client
#  5. Get Current Sha Commit
#  6. Create Job Name
#  7. Update Test Configuration File
#  8. Install EKSClient
#  9. Create Gpu Node Pool
#  10. Check Current Node Pool | Current Elatic Pods
#  11. Apply Elastic
#  12. Wait 5 sec
#  13. Find ETCD TCP Address
#  14. Update Test Configuration File
#  15. Apply Multi Node Testing
#  16. Wait 120 secs
#  17. Listen to Jobs Logging
#  18. Statistics
#  19. Upload coverage results
#  20. Upload coverage to Codecov
#  21. Delete Group Node

on:
  push:
    branches:
      - never-ever-run-
  #pull_request:
  #  types: [closed]

env:
  AWS_CLUSTER: pl-lightning-torchelastic
  NODE_TYPE: g4dn.xlarge
  NODES: 2
  NUM_GPUS: 1
  REGION: us-east-2
  MAX_CHECKS: 300
  CHECK_SPEEP: 2

jobs:
  multi-nodes-gpu-testing:
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        python-version: [3.7]
        pytorch-version: [1.6]
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50

    # runs only when merged happened.
    # if: github.event.pull_request.merged == true
    steps:

    - name: Checkout Pytorch Lightning
      uses: actions/checkout@v2
      with:
        repository: PyTorchLightning/pytorch-lightning
        ref: ${{ github.event.base_ref }}

    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}

    - name: Weekly reset caching
      run: echo "::set-output name=period::$(python -c 'import time ; days = time.time() / 60 / 60 / 24 ; print(int(days / 7))' 2>&1)"
      id: times

    # Note: This uses an internal pip API and may not always work
    # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
    - name: Cache pip
      uses: actions/cache@v2
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-multi-node
        restore-keys: |
          ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-

    - name: Install dependencies
      run: |
        pip install awscli coverage

    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v1
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
        aws-region: us-east-2

    - name: Get Current Sha Commit
      id: vars
      shell: bash
      run: |
        echo "::set-output name=SHA::$(git rev-parse --short HEAD)"
        echo $PWD

    - name: Create Job Name
      id: job
      shell: bash
      run: |
        echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )"
        echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )"

    - name: Install EKSClient
      run: |
        curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
        sudo mv /tmp/eksctl /usr/local/bin
      shell: bash

    - name: Create Gpu Node Pool
      run: |
        aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER
        eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE  --nodes=$NODES
        # eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE  --nodes=$NODES
      shell: bash

    - name: Check Current Node Pool | Current Elatic Pods
      run: |
        eksctl get nodegroups --cluster $AWS_CLUSTER
        kubectl get pods -n elastic-job

    - name: Apply Elastic
      run: |
        git clone https://github.com/pytorch/elastic.git
        cd elastic/kubernetes

        kubectl apply -k config/default

        kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml
        kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml

    - name: Wait
      # todo: this shall be dynamic
      if: always()
      shell: bash
      run: |
        sleep 5

    - name: Find ETCD TCP Address
      id: tcp
      shell: bash
      run: |
        echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' | head -1)"

    - name: Update Test Config. File
      run: |
        import os
        from dtrun.configs import prepare_multi_nodes_gpu_config

        assert os.path.isfile('./tests/mnode_tests.txt')
        prepare_multi_nodes_gpu_config(
            './.github/multi-nodes-gpu.yaml',
            './tests/mnode_tests.txt',
            sha="${{ steps.vars.outputs.SHA }}",
            tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",
            python_version="${{ matrix.python-version }}",
            torch_version="${{ matrix.pytorch-version }}",
            num_gpus=1,
        )
      shell: python

    - name: Apply Multi Node Testing
      run: |
        # cat ./.github/multi-nodes-gpu.yaml
        kubectl apply -f ./.github/multi-nodes-gpu.yaml
      shell: bash

    - name: Wait
      # todo: this shall be dynamic
      if: always()
      shell: bash
      run: |
        sleep 400

    - name: Listen to Jobs Logging
      shell: bash
      run: |
        # todo: Enable automatic checking.
        # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
        # echo "Done waiting. Job status code: $status_code" && \
        kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
        if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
        cat xx00

    - name: Statistics
      if: success()
      run: |
        cat ./xx01  | tail -n +2 | base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage
        cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml

    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v1
      if: always()
      # see: https://github.com/actions/toolkit/issues/399
      continue-on-error: true
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        file: coverage.xml
        flags: multi-nodes,pytest
        name: multi-nodes-coverage
        fail_ci_if_error: false

    - name: Delete Group Node
      if: always()
      run: |
       kubectl delete  ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job
       eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER
Mnodes (#5020) * add a multi-nodesworkflow Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz> 2021-01-21 10:18:06 +00:00			`name: Multi Nodes GPU Tests`

			`# Workflow Steps:`
			`# 1. Checkout Pytorch Lightning`
			`# 2. Set up Python`
			`# 3. Configure AWS Credentials`
			`# 4. Install AWS Client`
			`# 5. Get Current Sha Commit`
			`# 6. Create Job Name`
			`# 7. Update Test Configuration File`
			`# 8. Install EKSClient`
			`# 9. Create Gpu Node Pool`
			`# 10. Check Current Node Pool \| Current Elatic Pods`
			`# 11. Apply Elastic`
			`# 12. Wait 5 sec`
			`# 13. Find ETCD TCP Address`
			`# 14. Update Test Configuration File`
			`# 15. Apply Multi Node Testing`
			`# 16. Wait 120 secs`
			`# 17. Listen to Jobs Logging`
			`# 18. Statistics`
			`# 19. Upload coverage results`
			`# 20. Upload coverage to Codecov`
			`# 21. Delete Group Node`

			`on:`
			`push:`
			`branches:`
add Azure tags trigger (#6066) * add Azure tags trigger * fix * mnodes 2021-02-18 21:41:16 +00:00			`- never-ever-run-`
			`#pull_request:`
			`# types: [closed]`
Mnodes (#5020) * add a multi-nodesworkflow Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz> 2021-01-21 10:18:06 +00:00
			`env:`
			`AWS_CLUSTER: pl-lightning-torchelastic`
			`NODE_TYPE: g4dn.xlarge`
			`NODES: 2`
			`NUM_GPUS: 1`
			`REGION: us-east-2`
			`MAX_CHECKS: 300`
			`CHECK_SPEEP: 2`

			`jobs:`
			`multi-nodes-gpu-testing:`
			`runs-on: ubuntu-20.04`
			`strategy:`
			`fail-fast: false`
			`matrix:`
			`python-version: [3.7]`
Set minimum PyTorch version to 1.6 (#8288) Co-authored-by: Jirka <jirka.borovec@seznam.cz> 2021-07-13 17:12:49 +00:00			`pytorch-version: [1.6]`
Mnodes (#5020) * add a multi-nodesworkflow Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz> 2021-01-21 10:18:06 +00:00			`# Timeout: https://stackoverflow.com/a/59076067/4521646`
			`timeout-minutes: 50`

			`# runs only when merged happened.`
			`# if: github.event.pull_request.merged == true`
			`steps:`

			`- name: Checkout Pytorch Lightning`
			`uses: actions/checkout@v2`
			`with:`
			`repository: PyTorchLightning/pytorch-lightning`
			`ref: ${{ github.event.base_ref }}`

			`- name: Set up Python`
			`uses: actions/setup-python@v2`
			`with:`
			`python-version: ${{ matrix.python-version }}`

CI: Reset cache weekly (#7686) * Reset cache weekly * Update ci_test-base.yml * Update docs-checks.yml * Update ci_test-mnodes.yml * Update release-pypi.yml * Remove if latest 2021-05-25 22:53:38 +00:00			`- name: Weekly reset caching`
			`run: echo "::set-output name=period::$(python -c 'import time ; days = time.time() / 60 / 60 / 24 ; print(int(days / 7))' 2>&1)"`
			`id: times`

Mnodes (#5020) * add a multi-nodesworkflow Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz> 2021-01-21 10:18:06 +00:00			`# Note: This uses an internal pip API and may not always work`
			`# https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow`
			`- name: Cache pip`
			`uses: actions/cache@v2`
			`with:`
			`path: ~/.cache/pip`
CI: Reset cache weekly (#7686) * Reset cache weekly * Update ci_test-base.yml * Update docs-checks.yml * Update ci_test-mnodes.yml * Update release-pypi.yml * Remove if latest 2021-05-25 22:53:38 +00:00			`key: ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-multi-node`
Mnodes (#5020) * add a multi-nodesworkflow Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz> 2021-01-21 10:18:06 +00:00			`restore-keys: \|`
CI: Reset cache weekly (#7686) * Reset cache weekly * Update ci_test-base.yml * Update docs-checks.yml * Update ci_test-mnodes.yml * Update release-pypi.yml * Remove if latest 2021-05-25 22:53:38 +00:00			`${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-`
Mnodes (#5020) * add a multi-nodesworkflow Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz> 2021-01-21 10:18:06 +00:00
			`- name: Install dependencies`
			`run: \|`
			`pip install awscli coverage`

			`- name: Configure AWS Credentials`
			`uses: aws-actions/configure-aws-credentials@v1`
			`with:`
			`aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}`
			`aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}`
			`aws-region: us-east-2`

			`- name: Get Current Sha Commit`
			`id: vars`
			`shell: bash`
			`run: \|`
			`echo "::set-output name=SHA::$(git rev-parse --short HEAD)"`
			`echo $PWD`

			`- name: Create Job Name`
			`id: job`
			`shell: bash`
			`run: \|`
			`echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' \| tr . - )"`
			`echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' \| tr . - )"`

			`- name: Install EKSClient`
			`run: \|`
			`curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" \| tar xz -C /tmp`
			`sudo mv /tmp/eksctl /usr/local/bin`
			`shell: bash`

			`- name: Create Gpu Node Pool`
			`run: \|`
			`aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER`
			`eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES`
			`# eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES`
			`shell: bash`

			`- name: Check Current Node Pool \| Current Elatic Pods`
			`run: \|`
			`eksctl get nodegroups --cluster $AWS_CLUSTER`
			`kubectl get pods -n elastic-job`

			`- name: Apply Elastic`
			`run: \|`
			`git clone https://github.com/pytorch/elastic.git`
			`cd elastic/kubernetes`

			`kubectl apply -k config/default`

			`kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml`
			`kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml`

			`- name: Wait`
			`# todo: this shall be dynamic`
			`if: always()`
			`shell: bash`
			`run: \|`
			`sleep 5`

			`- name: Find ETCD TCP Address`
			`id: tcp`
			`shell: bash`
			`run: \|`
			`echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job \| grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' \| head -1)"`

			`- name: Update Test Config. File`
			`run: \|`
			`import os`
			`from dtrun.configs import prepare_multi_nodes_gpu_config`

			`assert os.path.isfile('./tests/mnode_tests.txt')`
			`prepare_multi_nodes_gpu_config(`
			`'./.github/multi-nodes-gpu.yaml',`
			`'./tests/mnode_tests.txt',`
			`sha="${{ steps.vars.outputs.SHA }}",`
			`tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",`
			`python_version="${{ matrix.python-version }}",`
			`torch_version="${{ matrix.pytorch-version }}",`
			`num_gpus=1,`
			`)`
			`shell: python`

			`- name: Apply Multi Node Testing`
			`run: \|`
			`# cat ./.github/multi-nodes-gpu.yaml`
			`kubectl apply -f ./.github/multi-nodes-gpu.yaml`
			`shell: bash`

			`- name: Wait`
			`# todo: this shall be dynamic`
			`if: always()`
			`shell: bash`
			`run: \|`
			`sleep 400`

			`- name: Listen to Jobs Logging`
			`shell: bash`
			`run: \|`
			`# todo: Enable automatic checking.`
			`# while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job \| grep -i "error\\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job \| grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \`
			`# echo "Done waiting. Job status code: $status_code" && \`
			`kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt`
			`if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \`
			`cat xx00`

			`- name: Statistics`
			`if: success()`
			`run: \|`
			`cat ./xx01 \| tail -n +2 \| base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage`
			`cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml`

			`- name: Upload coverage to Codecov`
			`uses: codecov/codecov-action@v1`
			`if: always()`
			`# see: https://github.com/actions/toolkit/issues/399`
			`continue-on-error: true`
			`with:`
			`token: ${{ secrets.CODECOV_TOKEN }}`
			`file: coverage.xml`
			`flags: multi-nodes,pytest`
			`name: multi-nodes-coverage`
			`fail_ci_if_error: false`

			`- name: Delete Group Node`
			`if: always()`
			`run: \|`
			`kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job`
			`eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER`