125 lines
4.1 KiB
YAML
125 lines
4.1 KiB
YAML
# Azure pipeline that runs the HPU tests on a DL1 instance.

# CI trigger: build every tag, plus pushes to master and release branches.
trigger:
  tags:
    include:
      - "*"
  branches:
    include:
      - "master"
      - "release/*"
      - "refs/tags/*"

# PR trigger: only pull requests that target master or a release branch AND
# touch one of the included paths (minus the excluded ones) start this pipeline.
pr:
  branches:
    include:
      - "master"
      - "release/*"
  paths:
    include:
      # CI helpers and this pipeline definition itself
      - ".actions/**"
      - ".azure/hpu-tests.yml"
      # example executed by the 'Testing: HPU examples' step
      - "examples/pl_hpu/mnist_sample.py"
      # fabric sources and requirements
      - "requirements/fabric/**"
      - "src/lightning/fabric/**"
      - "src/lightning_fabric/*"
      # pytorch sources, requirements and tests
      - "requirements/pytorch/**"
      - "src/lightning/pytorch/**"
      - "src/pytorch_lightning/*"
      - "tests/tests_pytorch/**"
      - "pyproject.toml"  # includes pytest config
    exclude:
      # docs-only changes do not need an HPU run
      - "requirements/*/docs.txt"
      - "*.md"
      - "**/*.md"

jobs:
  - job: testing
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "10"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    pool: intel-hpus
    container:
      image: "vault.habana.ai/gaudi-docker/1.8.0/ubuntu20.04/habanalabs/pytorch-installer-1.13.1:latest"
      # The host docker binary is mounted read-only at /tmp/docker; the
      # 'Install Sudo in container' step below uses it to exec into this container.
      options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g -v /usr/bin/docker:/tmp/docker:ro"
    workspace:
      clean: all

    steps:
      # Publish this container's ID (3rd '/'-separated field of the first line
      # of /proc/self/cgroup) as the pipeline variable CONTAINER_ID.
      - bash: |
          echo "##vso[task.setvariable variable=CONTAINER_ID]$(head -1 /proc/self/cgroup|cut -d/ -f3)"
        displayName: 'Set environment variables'

      # Exec into the container as user 0 through the mounted docker binary and
      # install sudo. The Dpkg option is written without inner double quotes:
      # nested inside the double-quoted `sh -c` string they would terminate and
      # reopen that string instead of quoting the option value.
      - script: |
          /tmp/docker exec -t -u 0 $CONTAINER_ID \
            sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::=--force-confold -y install sudo"
        displayName: 'Install Sudo in container (thanks Microsoft!)'

      # Print hardware and Python information, then upgrade pip.
      - bash: |
          sudo apt-get install -y hwinfo
          hwinfo --short
          python --version
          sudo pip install pip -U
        displayName: 'Instance HW info'

      # Remove any pre-installed lightning packages, drop torch/torchvision from
      # the requirements via the assistant script, then install this checkout
      # with the 'extra' and 'test' extras.
      - bash: |
          set -e
          pip --version
          sudo pip uninstall -y lightning pytorch-lightning
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py requirements_prune_pkgs "--packages=[torch,torchvision]"
          pip install ".[extra,test]"
          pip list
        env:
          PACKAGE_NAME: "pytorch"
          FREEZE_REQUIREMENTS: "1"
        displayName: 'Install package & dependencies'

      # List the devices and check that the habanalabs kernel module is loaded.
      - bash: |
          hl-smi -L
          lsmod | grep habanalabs
        displayName: 'Check the driver status'

      # Rewrite `lightning.fabric` / `lightning.pytorch` imports in the tests
      # and the HPU example to `lightning_fabric` / `pytorch_lightning`.
      - bash: |
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_pytorch" \
            --source_import="lightning.fabric,lightning.pytorch" \
            --target_import="lightning_fabric,pytorch_lightning"
          python .actions/assistant.py copy_replace_imports --source_dir="./examples/pl_hpu" \
            --source_import="lightning.fabric,lightning.pytorch" \
            --target_import="lightning_fabric,pytorch_lightning"
        displayName: 'Adjust tests & examples'

      - bash: |
          python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'Single card HPU test'

      # Same test module as above, run with `--hpus 8`.
      - bash: |
          python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'Multi card(8) HPU test'

      # Precision plugin tests; the bf16 / fp32 op lists are passed as files.
      - bash: |
          python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \
            'plugins/precision/hpu/ops_bf16.txt' --hmp-fp32 \
            'plugins/precision/hpu/ops_fp32.txt' --forked \
            --junitxml=hpu1_precision_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'HPU precision test'

      # Run the MNIST example with the examples directory appended to PYTHONPATH.
      - bash: |
          export PYTHONPATH="${PYTHONPATH}:$(pwd)"
          python "pl_hpu/mnist_sample.py"
        workingDirectory: examples
        displayName: 'Testing: HPU examples'

      # Publish every hpu*_test-results.xml written by the pytest steps above;
      # runs when earlier steps succeeded or failed so failures are reported too.
      - task: PublishTestResults@2
        inputs:
          testResultsFiles: 'tests/tests_pytorch/hpu*_test-results.xml'
          # NOTE(review): `python.version` is not defined anywhere in this file;
          # confirm it is set elsewhere, otherwise the macro stays unexpanded.
          testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
        condition: succeededOrFailed()
        displayName: 'Publish test results'