CI: Use self-hosted Azure GPU runners (#14632)
* move config * Apply suggestions from code review Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com> Co-authored-by: otaj <6065855+otaj@users.noreply.github.com>
This commit is contained in:
parent
0a9fc22b4f
commit
5f106957f7
|
@ -0,0 +1,46 @@
|
|||
# Creating a GPU self-hosted agent pool
|
||||
|
||||
## Prepare the machine
|
||||
|
||||
This is a slightly modified version of the script from
|
||||
https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
|
||||
|
||||
```bash
|
||||
apt-get update
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
curl \
|
||||
jq \
|
||||
git \
|
||||
iputils-ping \
|
||||
libcurl4 \
|
||||
libunwind8 \
|
||||
netcat \
|
||||
libssl1.0
|
||||
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
mkdir /azp
|
||||
```
|
||||
|
||||
## Starting the agents
|
||||
|
||||
```bash
|
||||
export TARGETARCH=linux-x64
|
||||
export AZP_URL="https://dev.azure.com/Lightning-AI"
|
||||
export AZP_TOKEN="xxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||||
export AZP_POOL="lit-rtx-3090"
|
||||
|
||||
for i in {0..7..2}
|
||||
do
|
||||
nohup bash .azure/start.sh \
|
||||
"AZP_AGENT_NAME=litGPU-YX_$i,$((i+1))" \
|
||||
"CUDA_VISIBLE_DEVICES=$i,$((i+1))" \
|
||||
> "agent-$i.log" &
|
||||
done
|
||||
```
|
||||
|
||||
## Check running agents
|
||||
|
||||
```bash
|
||||
ps aux | grep start.sh
|
||||
```
|
|
@ -41,12 +41,14 @@ jobs:
|
|||
timeoutInMinutes: "20"
|
||||
# how much time to give 'run always even if cancelled tasks' before stopping them
|
||||
cancelTimeoutInMinutes: "2"
|
||||
pool: azure-jirka-spot
|
||||
pool: lit-rtx-3090
|
||||
variables:
|
||||
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
|
||||
container:
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
|
||||
# default shm size is 64m. Increase it to avoid:
|
||||
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
|
||||
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
|
||||
options: "--gpus=all --shm-size=2gb"
|
||||
workspace:
|
||||
clean: all
|
||||
|
||||
|
@ -61,6 +63,10 @@ jobs:
|
|||
pip list
|
||||
displayName: 'Image info & NVIDIA'
|
||||
|
||||
- bash: |
|
||||
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
|
||||
displayName: 'set visible devices'
|
||||
|
||||
- bash: |
|
||||
set -e
|
||||
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
|
||||
|
@ -78,8 +84,9 @@ jobs:
|
|||
|
||||
- bash: |
|
||||
set -e
|
||||
echo $CUDA_VISIBLE_DEVICES
|
||||
python requirements/collect_env_details.py
|
||||
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
|
||||
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
|
||||
displayName: 'Env details'
|
||||
|
||||
- bash: python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
|
||||
|
|
|
@ -67,12 +67,14 @@ jobs:
|
|||
timeoutInMinutes: "80"
|
||||
# how much time to give 'run always even if cancelled tasks' before stopping them
|
||||
cancelTimeoutInMinutes: "2"
|
||||
pool: azure-jirka-spot
|
||||
pool: lit-rtx-3090
|
||||
variables:
|
||||
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
|
||||
container:
|
||||
image: $(image)
|
||||
# default shm size is 64m. Increase it to avoid:
|
||||
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
|
||||
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
|
||||
options: "--gpus=all --shm-size=2gb"
|
||||
workspace:
|
||||
clean: all
|
||||
|
||||
|
@ -87,6 +89,10 @@ jobs:
|
|||
pip list
|
||||
displayName: 'Image info & NVIDIA'
|
||||
|
||||
- bash: |
|
||||
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
|
||||
displayName: 'set visible devices'
|
||||
|
||||
- bash: |
|
||||
set -e
|
||||
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
|
||||
|
@ -112,8 +118,9 @@ jobs:
|
|||
|
||||
- bash: |
|
||||
set -e
|
||||
echo $CUDA_VISIBLE_DEVICES
|
||||
python requirements/collect_env_details.py
|
||||
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
|
||||
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
|
||||
python requirements/pytorch/check-avail-strategies.py
|
||||
python requirements/pytorch/check-avail-extras.py
|
||||
displayName: 'Env details'
|
||||
|
|
|
@ -5,6 +5,15 @@
|
|||
|
||||
set -e
|
||||
|
||||
# export all args as env variables
|
||||
# Export every positional argument of the form NAME=value as an environment
# variable, echoing each one as it is processed.
#
# `export "$assignment"` is used instead of `eval "export ..."` so the value is
# never re-parsed by the shell: whitespace stays part of the value and any
# `$(...)` or backticks inside it are kept literally instead of being executed.
# NOTE(review): callers that relied on eval to strip embedded quotes
# (e.g. 'NAME="a b"') must now pass the bare value ("NAME=a b").
#
# Arguments: any number of NAME=value strings
# Outputs:   each argument on its own line on stdout
export_args() {
  local assignment
  for assignment in "$@"; do
    echo "$assignment"
    export "$assignment"
  done
}

export_args "$@"
|
||||
|
||||
printenv
|
||||
|
||||
if [ -z "$AZP_URL" ]; then
|
||||
echo 1>&2 "error: missing AZP_URL environment variable"
|
||||
exit 1
|
||||
|
@ -26,9 +35,9 @@ if [ -n "$AZP_WORK" ]; then
|
|||
mkdir -p "$AZP_WORK"
|
||||
fi
|
||||
|
||||
rm -rf /azp/agent
|
||||
mkdir /azp/agent
|
||||
cd /azp/agent
|
||||
rm -rf /azp/agent-$AZP_AGENT_NAME
|
||||
mkdir /azp/agent-$AZP_AGENT_NAME
|
||||
cd /azp/agent-$AZP_AGENT_NAME
|
||||
|
||||
export AGENT_ALLOW_RUNASROOT="1"
|
||||
|
||||
|
@ -74,7 +83,7 @@ curl -LsS $AZP_AGENTPACKAGE_URL | tar -xz & wait $!
|
|||
|
||||
source ./env.sh
|
||||
|
||||
print_header "3. Configuring Azure Pipelines agent..."
|
||||
print_header "3. Configuring Azure Pipelines agent $AZP_AGENT_NAME..."
|
||||
|
||||
./config.sh --unattended \
|
||||
--agent "${AZP_AGENT_NAME:-$(hostname)}" \
|
|
@ -59,7 +59,7 @@ RUN pip uninstall pytorch-lightning -y
|
|||
|
||||
WORKDIR /azp
|
||||
|
||||
COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
|
||||
COPY ./.azure/start.sh /usr/local/bin/
|
||||
RUN chmod +x /usr/local/bin/start.sh
|
||||
|
||||
ENTRYPOINT ["/usr/local/bin/start.sh"]
|
||||
|
|
|
@ -23,7 +23,7 @@ RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
|||
|
||||
WORKDIR /azp
|
||||
|
||||
COPY ./dockers/ci-runner-ipu/start.sh /usr/local/bin/
|
||||
COPY ./.azure/start.sh /usr/local/bin/
|
||||
|
||||
RUN curl -o /usr/local/bin/installdependencies.sh \
|
||||
"https://raw.githubusercontent.com/microsoft/azure-pipelines-agent/d2acd5f77c6b3914cdb6ed0e5fbea672929c7da9/src/Misc/layoutbin/installdependencies.sh" && \
|
||||
|
|
|
@ -1,96 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# This is a slightly modified version of the script from
|
||||
# https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
|
||||
|
||||
set -e
|
||||
|
||||
if [ -z "$AZP_URL" ]; then
|
||||
echo 1>&2 "error: missing AZP_URL environment variable"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Resolve the file that holds the access token. When the caller did not supply
# AZP_TOKEN_FILE, the value of AZP_TOKEN is required and is written to
# /azp/.token, which then becomes AZP_TOKEN_FILE.
if [ -z "$AZP_TOKEN_FILE" ]; then
  if [ -z "$AZP_TOKEN" ]; then
    echo 1>&2 "error: missing AZP_TOKEN environment variable"
    exit 1
  fi

  AZP_TOKEN_FILE=/azp/.token
  # printf with a quoted argument writes the token verbatim: no trailing
  # newline, no word splitting or globbing, and no risk of the value being
  # taken as an echo option.
  printf '%s' "$AZP_TOKEN" > "$AZP_TOKEN_FILE"
fi
|
||||
|
||||
unset AZP_TOKEN
|
||||
|
||||
if [ -n "$AZP_WORK" ]; then
|
||||
mkdir -p "$AZP_WORK"
|
||||
fi
|
||||
|
||||
rm -rf /azp/agent
|
||||
mkdir /azp/agent
|
||||
cd /azp/agent
|
||||
|
||||
export AGENT_ALLOW_RUNASROOT="1"
|
||||
|
||||
# Run `./config.sh remove` for the agent unpacked in the current directory.
# Does nothing when config.sh is not present in the current directory.
# Globals:   AZP_TOKEN_FILE (read) - path of the file holding the token
# Calls:     print_header, ./config.sh
cleanup() {
  if [ -e config.sh ]; then
    print_header "Cleanup. Removing Azure Pipelines agent..."

    # Quote the substitution so the token always reaches config.sh as exactly
    # one argument, even if it is empty or contains whitespace/glob characters.
    ./config.sh remove --unattended \
      --auth PAT \
      --token "$(cat "$AZP_TOKEN_FILE")"
  fi
}
|
||||
|
||||
# Print a section header in bold light cyan.
# Arguments: $1 - header text (backslash escapes in it are interpreted by echo -e)
# Outputs:   the coloured text followed by a newline on stdout
print_header() {
  # Keep the colour codes local so they do not leak into the script's globals.
  local lightcyan='\033[1;36m'
  local nocolor='\033[0m'
  echo -e "${lightcyan}$1${nocolor}"
}
|
||||
|
||||
# Let the agent ignore the token env variables
|
||||
export VSO_AGENT_IGNORE=AZP_TOKEN,AZP_TOKEN_FILE
|
||||
|
||||
print_header "1. Determining matching Azure Pipelines agent..."
|
||||
|
||||
# Query the packages endpoint under AZP_URL for linux-x64 agent packages.
# The credential is quoted so the token is passed to curl as a single word.
AZP_AGENT_RESPONSE=$(curl -LsS \
  -u "user:$(cat "$AZP_TOKEN_FILE")" \
  -H 'Accept:application/json;api-version=3.0-preview' \
  "$AZP_URL/_apis/distributedtask/packages/agent?platform=linux-x64")

# Only extract a URL when the response parses as JSON. The jq filter sorts the
# entries by [major, minor, patch, downloadUrl] and takes the downloadUrl of
# the last one.
if echo "$AZP_AGENT_RESPONSE" | jq . >/dev/null 2>&1; then
  AZP_AGENTPACKAGE_URL=$(echo "$AZP_AGENT_RESPONSE" \
    | jq -r '.value | map([.version.major,.version.minor,.version.patch,.downloadUrl]) | sort | .[length-1] | .[3]')
fi

# jq -r prints the string "null" when no entry matched, so treat it like empty.
# [[ ... || ... ]] replaces the deprecated, ambiguous `[ ... -o ... ]` form.
if [[ -z "$AZP_AGENTPACKAGE_URL" || "$AZP_AGENTPACKAGE_URL" == "null" ]]; then
  echo 1>&2 "error: could not determine a matching Azure Pipelines agent - check that account '$AZP_URL' is correct and the token is valid for that account"
  exit 1
fi
|
||||
|
||||
print_header "2. Downloading and installing Azure Pipelines agent..."
|
||||
|
||||
# Stream the agent tarball into tar, extracting into the current directory.
# The pipeline runs in the background and is waited for immediately.
# The URL is quoted so it is always passed to curl as a single argument.
curl -LsS "$AZP_AGENTPACKAGE_URL" | tar -xz & wait $!

# env.sh comes from the extracted agent package.
source ./env.sh

print_header "3. Configuring Azure Pipelines agent..."

# Register the agent. Defaults: agent name = hostname, pool = Default,
# work directory = _work. The token substitution is quoted so it reaches
# config.sh as exactly one argument.
./config.sh --unattended \
  --agent "${AZP_AGENT_NAME:-$(hostname)}" \
  --url "$AZP_URL" \
  --auth PAT \
  --token "$(cat "$AZP_TOKEN_FILE")" \
  --pool "${AZP_POOL:-Default}" \
  --work "${AZP_WORK:-_work}" \
  --replace \
  --acceptTeeEula & wait $!
|
||||
|
||||
print_header "4. Running Azure Pipelines agent..."
|
||||
|
||||
trap 'cleanup; exit 130' INT
|
||||
trap 'cleanup; exit 143' TERM
|
||||
|
||||
# To be aware of TERM and INT signals call run.sh
|
||||
# Running it with the --once flag at the end will shut down the agent after the build is executed
|
||||
./run.sh --once & wait $!
|
Loading…
Reference in New Issue