diff --git a/.azure/README.md b/.azure/README.md new file mode 100644 index 0000000000..917532b083 --- /dev/null +++ b/.azure/README.md @@ -0,0 +1,46 @@ +# Creating a GPU self-hosted agent pool + +## Prepare the machine + +This is a slightly modified version of the script from +https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker + +```bash +apt-get update +apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + jq \ + git \ + iputils-ping \ + libcurl4 \ + libunwind8 \ + netcat \ + libssl1.0 + +curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash +mkdir /azp +``` + +## Starting the agents + +```bash +export TARGETARCH=linux-x64 +export AZP_URL="https://dev.azure.com/Lightning-AI" +export AZP_TOKEN="xxxxxxxxxxxxxxxxxxxxxxxxxx" +export AZP_POOL="lit-rtx-3090" + +for i in {0..7..2} +do + nohup bash .azure/start.sh \ + "AZP_AGENT_NAME=litGPU-YX_$i,$((i+1))" \ + "CUDA_VISIBLE_DEVICES=$i,$((i+1))" \ + > "agent-$i.log" & +done +``` + +## Check running agents + +```bash +ps aux | grep start.sh +``` diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml index bf0ed0a0b9..e439418c09 100644 --- a/.azure/gpu-tests-lite.yml +++ b/.azure/gpu-tests-lite.yml @@ -41,12 +41,14 @@ jobs: timeoutInMinutes: "20" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: azure-jirka-spot + pool: lit-rtx-3090 + variables: + DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" # default shm size is 64m. 
Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" + options: "--gpus=all --shm-size=2gb" workspace: clean: all @@ -61,6 +63,10 @@ jobs: pip list displayName: 'Image info & NVIDIA' + - bash: | + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + displayName: 'set visible devices' + - bash: | set -e PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") @@ -78,8 +84,9 @@ jobs: - bash: | set -e + echo $CUDA_VISIBLE_DEVICES python requirements/collect_env_details.py - python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" displayName: 'Env details' - bash: python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 42c153a5b5..b2104780f1 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -67,12 +67,14 @@ jobs: timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: azure-jirka-spot + pool: lit-rtx-3090 + variables: + DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: image: $(image) # default shm size is 64m. 
Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" + options: "--gpus=all --shm-size=2gb" workspace: clean: all @@ -87,6 +89,10 @@ jobs: pip list displayName: 'Image info & NVIDIA' + - bash: | + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + displayName: 'set visible devices' + - bash: | set -e python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" @@ -112,8 +118,9 @@ jobs: - bash: | set -e + echo $CUDA_VISIBLE_DEVICES python requirements/collect_env_details.py - python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" python requirements/pytorch/check-avail-strategies.py python requirements/pytorch/check-avail-extras.py displayName: 'Env details' diff --git a/dockers/ci-runner-hpu/start.sh b/.azure/start.sh similarity index 89% rename from dockers/ci-runner-hpu/start.sh rename to .azure/start.sh index 82472a817a..adaa6735ae 100644 --- a/dockers/ci-runner-hpu/start.sh +++ b/.azure/start.sh @@ -5,6 +5,15 @@ set -e +# export all args as env variables +for var in "$@" +do + echo "$var" + eval "export $var" +done + +printenv + if [ -z "$AZP_URL" ]; then echo 1>&2 "error: missing AZP_URL environment variable" exit 1 @@ -26,9 +35,9 @@ if [ -n "$AZP_WORK" ]; then mkdir -p "$AZP_WORK" fi -rm -rf /azp/agent -mkdir /azp/agent -cd /azp/agent +rm -rf /azp/agent-$AZP_AGENT_NAME +mkdir /azp/agent-$AZP_AGENT_NAME +cd /azp/agent-$AZP_AGENT_NAME export AGENT_ALLOW_RUNASROOT="1" @@ -74,7 +83,7 @@ curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $! source ./env.sh -print_header "3. Configuring Azure Pipelines agent..." +print_header "3. 
Configuring Azure Pipelines agent $AZP_AGENT_NAME..." ./config.sh --unattended \ --agent "${AZP_AGENT_NAME:-$(hostname)}" \ diff --git a/dockers/ci-runner-hpu/Dockerfile b/dockers/ci-runner-hpu/Dockerfile index d1868fd6ce..548e6de30f 100644 --- a/dockers/ci-runner-hpu/Dockerfile +++ b/dockers/ci-runner-hpu/Dockerfile @@ -59,7 +59,7 @@ RUN pip uninstall pytorch-lightning -y WORKDIR /azp -COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ +COPY ./.azure/start.sh /usr/local/bin/ RUN chmod +x /usr/local/bin/start.sh ENTRYPOINT ["/usr/local/bin/start.sh"] diff --git a/dockers/ci-runner-ipu/Dockerfile b/dockers/ci-runner-ipu/Dockerfile index 1d1a41ab20..eed18b1596 100644 --- a/dockers/ci-runner-ipu/Dockerfile +++ b/dockers/ci-runner-ipu/Dockerfile @@ -23,7 +23,7 @@ RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers WORKDIR /azp -COPY ./dockers/ci-runner-ipu/start.sh /usr/local/bin/ +COPY ./.azure/start.sh /usr/local/bin/ RUN curl -o /usr/local/bin/installdependencies.sh \ "https://raw.githubusercontent.com/microsoft/azure-pipelines-agent/d2acd5f77c6b3914cdb6ed0e5fbea672929c7da9/src/Misc/layoutbin/installdependencies.sh" && \ diff --git a/dockers/ci-runner-ipu/start.sh b/dockers/ci-runner-ipu/start.sh deleted file mode 100644 index caa452b978..0000000000 --- a/dockers/ci-runner-ipu/start.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash - -# This is a slightly modified version of the script from -# https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker - -set -e - -if [ -z "$AZP_URL" ]; then - echo 1>&2 "error: missing AZP_URL environment variable" - exit 1 -fi - -if [ -z "$AZP_TOKEN_FILE" ]; then - if [ -z "$AZP_TOKEN" ]; then - echo 1>&2 "error: missing AZP_TOKEN environment variable" - exit 1 - fi - - AZP_TOKEN_FILE=/azp/.token - echo -n $AZP_TOKEN > "$AZP_TOKEN_FILE" -fi - -unset AZP_TOKEN - -if [ -n "$AZP_WORK" ]; then - mkdir -p "$AZP_WORK" -fi - -rm -rf /azp/agent -mkdir /azp/agent -cd /azp/agent - -export AGENT_ALLOW_RUNASROOT="1" - -cleanup() { 
- if [ -e config.sh ]; then - print_header "Cleanup. Removing Azure Pipelines agent..." - - ./config.sh remove --unattended \ - --auth PAT \ - --token $(cat "$AZP_TOKEN_FILE") - fi -} - -print_header() { - lightcyan='\033[1;36m' - nocolor='\033[0m' - echo -e "${lightcyan}$1${nocolor}" -} - -# Let the agent ignore the token env variables -export VSO_AGENT_IGNORE=AZP_TOKEN,AZP_TOKEN_FILE - -print_header "1. Determining matching Azure Pipelines agent..." - -AZP_AGENT_RESPONSE=$(curl -LsS \ - -u user:$(cat "$AZP_TOKEN_FILE") \ - -H 'Accept:application/json;api-version=3.0-preview' \ - "$AZP_URL/_apis/distributedtask/packages/agent?platform=linux-x64") - -if echo "$AZP_AGENT_RESPONSE" | jq . >/dev/null 2>&1; then - AZP_AGENTPACKAGE_URL=$(echo "$AZP_AGENT_RESPONSE" \ - | jq -r '.value | map([.version.major,.version.minor,.version.patch,.downloadUrl]) | sort | .[length-1] | .[3]') -fi - -if [ -z "$AZP_AGENTPACKAGE_URL" -o "$AZP_AGENTPACKAGE_URL" == "null" ]; then - echo 1>&2 "error: could not determine a matching Azure Pipelines agent - check that account '$AZP_URL' is correct and the token is valid for that account" - exit 1 -fi - -print_header "2. Downloading and installing Azure Pipelines agent..." - -curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $! - -source ./env.sh - -print_header "3. Configuring Azure Pipelines agent..." - -./config.sh --unattended \ - --agent "${AZP_AGENT_NAME:-$(hostname)}" \ - --url "$AZP_URL" \ - --auth PAT \ - --token $(cat "$AZP_TOKEN_FILE") \ - --pool "${AZP_POOL:-Default}" \ - --work "${AZP_WORK:-_work}" \ - --replace \ - --acceptTeeEula & wait $! - -print_header "4. Running Azure Pipelines agent..." - -trap 'cleanup; exit 130' INT -trap 'cleanup; exit 143' TERM - -# To be aware of TERM and INT signals call run.sh -# Running it with the --once flag at the end will shut down the agent after the build is executed -./run.sh --once & wait $!