# lightning/.azure/hpu-tests.yml
#
# 91 lines
# 2.9 KiB
# YAML

# Pipeline that runs the HPU tests on a DL1 instance
# CI trigger: run on every pushed tag and on pushes to the listed branches.
trigger:
  tags:
    include:
      - '*'
  branches:
    include:
      - "master"
      - "release/*"
      - "refs/tags/*"
# Pull-request trigger: branch list shorthand (master and any release/* branch).
pr:
  - 'master'
  - 'release/*'
# Single job: installs the package inside the Habana container, checks the
# driver, runs the HPU accelerator / precision tests and one example, then
# publishes the JUnit reports.
jobs:
  - job: testing
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "10"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    pool: habana-gaudi-hpus
    container:
      image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest"
      # The container is named `cd-container` and the host's /usr/bin/docker is
      # mounted read-only at /tmp/docker; the first step relies on both to
      # `docker exec` into the container as root.
      options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro"
    workspace:
      clean: all

    steps:
      # Install sudo as uid 0 via `docker exec`, since later steps call `sudo`.
      # `--force-confold` is passed without inner quotes: the original nested
      # double quotes inside the double-quoted `sh -c` argument, which the
      # shell collapsed to exactly this string anyway.
      - script: |
          /tmp/docker exec -t -u 0 cd-container \
            sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::=--force-confold -y install sudo"
        displayName: 'Install Sudo in container (thanks Microsoft!)'

      # Informational only: no `set -e`, so only the last command's exit code
      # decides whether this step fails.
      - bash: |
          sudo apt-get install -y hwinfo
          hwinfo --short
          python --version
          sudo pip install pip -U
        displayName: 'Instance HW info'

      # Remove any preinstalled lightning packages, prune torch* from the
      # requirements via the repo helper script, then install this checkout
      # with the `extra` and `test` extras.
      - bash: |
          set -e
          pip --version
          sudo pip uninstall -y lightning pytorch-lightning
          pip install fire
          python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext
          pip install ".[extra,test]"
          pip list
        env:
          PACKAGE_NAME: pytorch
          # quoted so the environment value is an explicit string
          FREEZE_REQUIREMENTS: "1"
        displayName: 'Install dependencies'

      # `set -e` makes a failing `hl-smi -L` fail the step; without it only the
      # exit code of the final `lsmod | grep` pipeline was taken into account.
      - bash: |
          set -e
          hl-smi -L
          lsmod | grep habanalabs
        displayName: 'Check the driver status'

      - bash: |
          python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'Single card HPU test'

      - bash: |
          python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'Multi card(8) HPU test'

      - bash: |
          python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \
            'plugins/precision/hpu/ops_bf16.txt' --hmp-fp32 \
            'plugins/precision/hpu/ops_fp32.txt' --forked \
            --junitxml=hpu1_precision_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'HPU precision test'

      # The current directory (examples/) is appended to PYTHONPATH before the
      # sample script is run.
      - bash: |
          export PYTHONPATH="${PYTHONPATH}:$(pwd)"
          python "pl_hpu/mnist_sample.py"
        workingDirectory: examples
        displayName: 'Testing: HPU examples'

      # The glob matches the three JUnit files written by the pytest steps
      # above (hpu1_, hpu8_ and hpu1_precision_).
      # NOTE(review): `python.version` is not defined anywhere in the visible
      # pipeline; if it is not set elsewhere the title will contain the literal
      # `$(python.version)` - confirm.
      - task: PublishTestResults@2
        inputs:
          testResultsFiles: 'tests/tests_pytorch/hpu*_test-results.xml'
          testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
        # publish the reports even when an earlier step has failed
        condition: succeededOrFailed()
        displayName: 'Publish test results'