CI: Use self-hosted Azure GPU runners (#14632)

* move config
* Apply suggestions from code review

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
Co-authored-by: otaj <6065855+otaj@users.noreply.github.com>
This commit is contained in:
Jirka Borovec 2022-10-05 12:43:54 +02:00 committed by GitHub
parent 0a9fc22b4f
commit 5f106957f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 81 additions and 108 deletions

46
.azure/README.md Normal file
View File

@ -0,0 +1,46 @@
# Creation GPU self-hosted agent pool
## Prepare the machine
This is a slightly modified version of the script from
https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
```bash
apt-get update
apt-get install -y --no-install-recommends \
ca-certificates \
curl \
jq \
git \
iputils-ping \
libcurl4 \
libunwind8 \
netcat \
libssl1.0
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
mkdir /azp
```
## Stating the agents
```bash
export TARGETARCH=linux-x64
export AZP_URL="https://dev.azure.com/Lightning-AI"
export AZP_TOKEN="xxxxxxxxxxxxxxxxxxxxxxxxxx"
export AZP_POOL="lit-rtx-3090"
for i in {0..7..2}
do
nohup bash .azure/start.sh \
"AZP_AGENT_NAME=litGPU-YX_$i,$((i+1))" \
"CUDA_VISIBLE_DEVICES=$i,$((i+1))" \
> "agent-$i.log" &
done
```
## Check running agents
```bash
ps aux | grep start.sh
```

View File

@ -41,12 +41,14 @@ jobs:
timeoutInMinutes: "20"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
pool: azure-jirka-spot
pool: lit-rtx-3090
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
options: "--gpus=all --shm-size=2gb"
workspace:
clean: all
@ -61,6 +63,10 @@ jobs:
pip list
displayName: 'Image info & NVIDIA'
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
displayName: 'set visible devices'
- bash: |
set -e
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
@ -78,8 +84,9 @@ jobs:
- bash: |
set -e
echo $CUDA_VISIBLE_DEVICES
python requirements/collect_env_details.py
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
displayName: 'Env details'
- bash: python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50

View File

@ -67,12 +67,14 @@ jobs:
timeoutInMinutes: "80"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
pool: azure-jirka-spot
pool: lit-rtx-3090
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
options: "--gpus=all --shm-size=2gb"
workspace:
clean: all
@ -87,6 +89,10 @@ jobs:
pip list
displayName: 'Image info & NVIDIA'
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
displayName: 'set visible devices'
- bash: |
set -e
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
@ -112,8 +118,9 @@ jobs:
- bash: |
set -e
echo $CUDA_VISIBLE_DEVICES
python requirements/collect_env_details.py
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
python requirements/pytorch/check-avail-strategies.py
python requirements/pytorch/check-avail-extras.py
displayName: 'Env details'

View File

@ -5,6 +5,15 @@
set -e
# export all args as env variables
for var in "$@"
do
echo "$var"
eval "export $var"
done
printenv
if [ -z "$AZP_URL" ]; then
echo 1>&2 "error: missing AZP_URL environment variable"
exit 1
@ -26,9 +35,9 @@ if [ -n "$AZP_WORK" ]; then
mkdir -p "$AZP_WORK"
fi
rm -rf /azp/agent
mkdir /azp/agent
cd /azp/agent
rm -rf /azp/agent-$AZP_AGENT_NAME
mkdir /azp/agent-$AZP_AGENT_NAME
cd /azp/agent-$AZP_AGENT_NAME
export AGENT_ALLOW_RUNASROOT="1"
@ -74,7 +83,7 @@ curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $!
source ./env.sh
print_header "3. Configuring Azure Pipelines agent..."
print_header "3. Configuring Azure Pipelines agent $AZP_AGENT_NAME..."
./config.sh --unattended \
--agent "${AZP_AGENT_NAME:-$(hostname)}" \

View File

@ -59,7 +59,7 @@ RUN pip uninstall pytorch-lightning -y
WORKDIR /azp
COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
COPY ./.azure/start.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/start.sh
ENTRYPOINT ["/usr/local/bin/start.sh"]

View File

@ -23,7 +23,7 @@ RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
WORKDIR /azp
COPY ./dockers/ci-runner-ipu/start.sh /usr/local/bin/
COPY ./.azure/start.sh /usr/local/bin/
RUN curl -o /usr/local/bin/installdependencies.sh \
"https://raw.githubusercontent.com/microsoft/azure-pipelines-agent/d2acd5f77c6b3914cdb6ed0e5fbea672929c7da9/src/Misc/layoutbin/installdependencies.sh" && \

View File

@ -1,96 +0,0 @@
#!/bin/bash
# This is a slightly modified version of the script from
# https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
set -e
if [ -z "$AZP_URL" ]; then
echo 1>&2 "error: missing AZP_URL environment variable"
exit 1
fi
if [ -z "$AZP_TOKEN_FILE" ]; then
if [ -z "$AZP_TOKEN" ]; then
echo 1>&2 "error: missing AZP_TOKEN environment variable"
exit 1
fi
AZP_TOKEN_FILE=/azp/.token
echo -n $AZP_TOKEN > "$AZP_TOKEN_FILE"
fi
unset AZP_TOKEN
if [ -n "$AZP_WORK" ]; then
mkdir -p "$AZP_WORK"
fi
rm -rf /azp/agent
mkdir /azp/agent
cd /azp/agent
export AGENT_ALLOW_RUNASROOT="1"
cleanup() {
if [ -e config.sh ]; then
print_header "Cleanup. Removing Azure Pipelines agent..."
./config.sh remove --unattended \
--auth PAT \
--token $(cat "$AZP_TOKEN_FILE")
fi
}
print_header() {
lightcyan='\033[1;36m'
nocolor='\033[0m'
echo -e "${lightcyan}$1${nocolor}"
}
# Let the agent ignore the token env variables
export VSO_AGENT_IGNORE=AZP_TOKEN,AZP_TOKEN_FILE
print_header "1. Determining matching Azure Pipelines agent..."
AZP_AGENT_RESPONSE=$(curl -LsS \
-u user:$(cat "$AZP_TOKEN_FILE") \
-H 'Accept:application/json;api-version=3.0-preview' \
"$AZP_URL/_apis/distributedtask/packages/agent?platform=linux-x64")
if echo "$AZP_AGENT_RESPONSE" | jq . >/dev/null 2>&1; then
AZP_AGENTPACKAGE_URL=$(echo "$AZP_AGENT_RESPONSE" \
| jq -r '.value | map([.version.major,.version.minor,.version.patch,.downloadUrl]) | sort | .[length-1] | .[3]')
fi
if [ -z "$AZP_AGENTPACKAGE_URL" -o "$AZP_AGENTPACKAGE_URL" == "null" ]; then
echo 1>&2 "error: could not determine a matching Azure Pipelines agent - check that account '$AZP_URL' is correct and the token is valid for that account"
exit 1
fi
print_header "2. Downloading and installing Azure Pipelines agent..."
curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $!
source ./env.sh
print_header "3. Configuring Azure Pipelines agent..."
./config.sh --unattended \
--agent "${AZP_AGENT_NAME:-$(hostname)}" \
--url "$AZP_URL" \
--auth PAT \
--token $(cat "$AZP_TOKEN_FILE") \
--pool "${AZP_POOL:-Default}" \
--work "${AZP_WORK:-_work}" \
--replace \
--acceptTeeEula & wait $!
print_header "4. Running Azure Pipelines agent..."
trap 'cleanup; exit 130' INT
trap 'cleanup; exit 143' TERM
# To be aware of TERM and INT signals call run.sh
# Running it with the --once flag at the end will shut down the agent after the build is executed
./run.sh --once & wait $!