CI: Use self-hosted Azure GPU runners (#14632)

* move config * Apply suggestions from code review Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com> Co-authored-by: otaj <6065855+otaj@users.noreply.github.com>
2022-10-05 12:43:54 +02:00 · 2022-10-05 12:43:54 +02:00 · 5f106957f7
parent 0a9fc22b4f
commit 5f106957f7
7 changed files with 81 additions and 108 deletions
--- a/.azure/README.md
+++ b/.azure/README.md
@ -0,0 +1,46 @@
+# Creating a GPU self-hosted agent pool
+
+## Prepare the machine
+
+This is a slightly modified version of the script from
+https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
+
+```bash
+apt-get update
+apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    jq \
+    git \
+    iputils-ping \
+    libcurl4 \
+    libunwind8 \
+    netcat \
+    libssl1.0
+
+curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+mkdir /azp
+```
+
+## Starting the agents
+
+```bash
+export TARGETARCH=linux-x64
+export AZP_URL="https://dev.azure.com/Lightning-AI"
+export AZP_TOKEN="xxxxxxxxxxxxxxxxxxxxxxxxxx"
+export AZP_POOL="lit-rtx-3090"
+
+for i in {0..7..2}
+do
+     nohup bash .azure/start.sh \
+        "AZP_AGENT_NAME=litGPU-YX_$i,$((i+1))" \
+        "CUDA_VISIBLE_DEVICES=$i,$((i+1))" \
+     > "agent-$i.log" &
+done
+```
+
+## Check running agents
+
+```bash
+ps aux | grep start.sh
+```
--- a/.azure/gpu-tests-lite.yml
+++ b/.azure/gpu-tests-lite.yml
@ -41,12 +41,14 @@ jobs:
    timeoutInMinutes: "20"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
-    pool: azure-jirka-spot
+    pool: lit-rtx-3090
+    variables:
+      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
    container:
      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
      # default shm size is 64m. Increase it to avoid:
      # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
-      options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
+      options: "--gpus=all --shm-size=2gb"
    workspace:
      clean: all

@ -61,6 +63,10 @@ jobs:
        pip list
      displayName: 'Image info & NVIDIA'

+    - bash: |
+        echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
+      displayName: 'set visible devices'
+
    - bash: |
        set -e
        PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
@ -78,8 +84,9 @@ jobs:

    - bash: |
        set -e
+        echo $CUDA_VISIBLE_DEVICES
        python requirements/collect_env_details.py
-        python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
+        python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
      displayName: 'Env details'

    - bash: python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@ -67,12 +67,14 @@ jobs:
    timeoutInMinutes: "80"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
-    pool: azure-jirka-spot
+    pool: lit-rtx-3090
+    variables:
+      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
    container:
      image: $(image)
      # default shm size is 64m. Increase it to avoid:
      # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
-      options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
+      options: "--gpus=all --shm-size=2gb"
    workspace:
      clean: all

@ -87,6 +89,10 @@ jobs:
        pip list
      displayName: 'Image info & NVIDIA'

+    - bash: |
+        echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
+      displayName: 'set visible devices'
+
    - bash: |
        set -e
        python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
@ -112,8 +118,9 @@ jobs:

    - bash: |
        set -e
+        echo $CUDA_VISIBLE_DEVICES
        python requirements/collect_env_details.py
-        python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
+        python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
        python requirements/pytorch/check-avail-strategies.py
        python requirements/pytorch/check-avail-extras.py
      displayName: 'Env details'
--- a/dockers/ci-runner-hpu/start.sh
+++ b/dockers/ci-runner-hpu/start.sh
@ -5,6 +5,15 @@

 set -e

+# export all args as env variables
+for var in "$@"
+do
+    echo "$var"
+    eval "export $var"
+done
+
+printenv
+
 if [ -z "$AZP_URL" ]; then
  echo 1>&2 "error: missing AZP_URL environment variable"
  exit 1
@ -26,9 +35,9 @@ if [ -n "$AZP_WORK" ]; then
  mkdir -p "$AZP_WORK"
 fi

-rm -rf /azp/agent
-mkdir /azp/agent
-cd /azp/agent
+rm -rf /azp/agent-$AZP_AGENT_NAME
+mkdir /azp/agent-$AZP_AGENT_NAME
+cd /azp/agent-$AZP_AGENT_NAME

 export AGENT_ALLOW_RUNASROOT="1"

@ -74,7 +83,7 @@ curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $!

 source ./env.sh

-print_header "3. Configuring Azure Pipelines agent..."
+print_header "3. Configuring Azure Pipelines agent $AZP_AGENT_NAME..."

 ./config.sh --unattended \
  --agent "${AZP_AGENT_NAME:-$(hostname)}" \
--- a/dockers/ci-runner-hpu/Dockerfile
+++ b/dockers/ci-runner-hpu/Dockerfile
@ -59,7 +59,7 @@ RUN pip uninstall pytorch-lightning -y

 WORKDIR /azp

-COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
+COPY ./.azure/start.sh /usr/local/bin/
 RUN chmod +x /usr/local/bin/start.sh

 ENTRYPOINT ["/usr/local/bin/start.sh"]
--- a/dockers/ci-runner-ipu/Dockerfile
+++ b/dockers/ci-runner-ipu/Dockerfile
@ -23,7 +23,7 @@ RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

 WORKDIR /azp

-COPY ./dockers/ci-runner-ipu/start.sh /usr/local/bin/
+COPY ./.azure/start.sh /usr/local/bin/

 RUN curl -o /usr/local/bin/installdependencies.sh \
    "https://raw.githubusercontent.com/microsoft/azure-pipelines-agent/d2acd5f77c6b3914cdb6ed0e5fbea672929c7da9/src/Misc/layoutbin/installdependencies.sh" && \
--- a/dockers/ci-runner-ipu/start.sh
+++ b/dockers/ci-runner-ipu/start.sh
@ -1,96 +0,0 @@
-#!/bin/bash
-
-# This is a slightly modified version of the script from
-# https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
-
-set -e
-
-if [ -z "$AZP_URL" ]; then
-  echo 1>&2 "error: missing AZP_URL environment variable"
-  exit 1
-fi
-
-if [ -z "$AZP_TOKEN_FILE" ]; then
-  if [ -z "$AZP_TOKEN" ]; then
-    echo 1>&2 "error: missing AZP_TOKEN environment variable"
-    exit 1
-  fi
-
-  AZP_TOKEN_FILE=/azp/.token
-  echo -n $AZP_TOKEN > "$AZP_TOKEN_FILE"
-fi
-
-unset AZP_TOKEN
-
-if [ -n "$AZP_WORK" ]; then
-  mkdir -p "$AZP_WORK"
-fi
-
-rm -rf /azp/agent
-mkdir /azp/agent
-cd /azp/agent
-
-export AGENT_ALLOW_RUNASROOT="1"
-
-cleanup() {
-  if [ -e config.sh ]; then
-    print_header "Cleanup. Removing Azure Pipelines agent..."
-
-    ./config.sh remove --unattended \
-      --auth PAT \
-      --token $(cat "$AZP_TOKEN_FILE")
-  fi
-}
-
-print_header() {
-  lightcyan='\033[1;36m'
-  nocolor='\033[0m'
-  echo -e "${lightcyan}$1${nocolor}"
-}
-
-# Let the agent ignore the token env variables
-export VSO_AGENT_IGNORE=AZP_TOKEN,AZP_TOKEN_FILE
-
-print_header "1. Determining matching Azure Pipelines agent..."
-
-AZP_AGENT_RESPONSE=$(curl -LsS \
-  -u user:$(cat "$AZP_TOKEN_FILE") \
-  -H 'Accept:application/json;api-version=3.0-preview' \
-  "$AZP_URL/_apis/distributedtask/packages/agent?platform=linux-x64")
-
-if echo "$AZP_AGENT_RESPONSE" | jq . >/dev/null 2>&1; then
-  AZP_AGENTPACKAGE_URL=$(echo "$AZP_AGENT_RESPONSE" \
-    | jq -r '.value | map([.version.major,.version.minor,.version.patch,.downloadUrl]) | sort | .[length-1] | .[3]')
-fi
-
-if [ -z "$AZP_AGENTPACKAGE_URL" -o "$AZP_AGENTPACKAGE_URL" == "null" ]; then
-  echo 1>&2 "error: could not determine a matching Azure Pipelines agent - check that account '$AZP_URL' is correct and the token is valid for that account"
-  exit 1
-fi
-
-print_header "2. Downloading and installing Azure Pipelines agent..."
-
-curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $!
-
-source ./env.sh
-
-print_header "3. Configuring Azure Pipelines agent..."
-
-./config.sh --unattended \
-  --agent "${AZP_AGENT_NAME:-$(hostname)}" \
-  --url "$AZP_URL" \
-  --auth PAT \
-  --token $(cat "$AZP_TOKEN_FILE") \
-  --pool "${AZP_POOL:-Default}" \
-  --work "${AZP_WORK:-_work}" \
-  --replace \
-  --acceptTeeEula & wait $!
-
-print_header "4. Running Azure Pipelines agent..."
-
-trap 'cleanup; exit 130' INT
-trap 'cleanup; exit 143' TERM
-
-# To be aware of TERM and INT signals call run.sh
-# Running it with the --once flag at the end will shut down the agent after the build is executed
-./run.sh --once & wait $!