# Pipeline to run the HPU tests in DL1 Instance
#
# Runs the PyTorch Lightning HPU accelerator/precision tests and the HPU MNIST
# example inside a Habana Gaudi container on the `intel-hpus` agent pool, then
# publishes the JUnit reports produced by pytest.

# CI trigger: every tag, plus pushes to master and release branches.
trigger:
  tags:
    include:
      - '*'
  branches:
    include:
      - "master"
      - "release/*"
      - "refs/tags/*"

# PR trigger: only PRs targeting master/release that touch the listed paths.
pr:
  branches:
    include:
      - "master"
      - "release/*"
  paths:
    include:
      - ".actions/**"
      - ".azure/hpu-tests.yml"
      - "examples/pl_hpu/mnist_sample.py"
      - "requirements/fabric/**"
      - "src/lightning/fabric/**"
      - "src/lightning_fabric/*"
      - "requirements/pytorch/**"
      - "src/lightning/pytorch/**"
      - "src/pytorch_lightning/*"
      - "tests/tests_pytorch/**"
      - "pyproject.toml"  # includes pytest config
    exclude:
      - "requirements/*/docs.txt"
      - "*.md"
      - "**/*.md"

jobs:
  - job: testing
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "10"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    pool: intel-hpus
    container:
      image: "vault.habana.ai/gaudi-docker/1.8.0/ubuntu20.04/habanalabs/pytorch-installer-1.13.1:latest"
      # Folded scalar: the lines below are joined with single spaces into one
      # option string. The host docker binary is mounted read-only at
      # /tmp/docker so a later step can `docker exec` into this container.
      options: >-
        --runtime=habana
        -e HABANA_VISIBLE_DEVICES=all
        -e OMPI_MCA_btl_vader_single_copy_mechanism=none
        --cap-add=sys_nice
        --ipc=host
        --shm-size=4g
        -v /usr/bin/docker:/tmp/docker:ro
    workspace:
      clean: all

    steps:
      # Take the third '/'-separated field of the first line of
      # /proc/self/cgroup and expose it to later steps as CONTAINER_ID.
      - bash: |
          echo "##vso[task.setvariable variable=CONTAINER_ID]$(head -1 /proc/self/cgroup|cut -d/ -f3)"
        displayName: 'Set environment variables'

      # Install sudo as uid 0 through the mounted docker binary; the following
      # steps call `sudo`.
      # The option value is written without nested double quotes: inside the
      # double-quoted `sh -c` string they would close and reopen the outer
      # string, and the resulting argument is the same without them.
      - script: |
          /tmp/docker exec -t -u 0 $CONTAINER_ID \
            sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::=--force-confold -y install sudo"
        displayName: 'Install Sudo in container (thanks Microsoft!)'

      # Print a hardware summary and the Python version, then upgrade pip.
      - bash: |
          sudo apt-get install -y hwinfo
          hwinfo --short
          python --version
          sudo pip install pip -U
        displayName: 'Instance HW info'

      # Remove any preinstalled lightning packages, prune torch/torchvision
      # from the requirements via the assistant script, then install this
      # checkout with its "extra" and "test" extras.
      - bash: |
          set -e
          pip --version
          sudo pip uninstall -y lightning pytorch-lightning
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py requirements_prune_pkgs "--packages=[torch,torchvision]"
          pip install ".[extra,test]"
          pip list
        env:
          # NOTE(review): presumably consumed by the package's build/setup
          # tooling during `pip install` -- confirm against setup.py.
          PACKAGE_NAME: "pytorch"
          FREEZE_REQUIREMENTS: "1"
        displayName: 'Install package & dependencies'

      # List the devices reported by hl-smi and check that a `habanalabs`
      # entry shows up in lsmod.
      - bash: |
          hl-smi -L
          lsmod | grep habanalabs
        displayName: 'Check the driver status'

      # Rewrite `lightning.fabric` / `lightning.pytorch` imports in the tests
      # and the HPU example to `lightning_fabric` / `pytorch_lightning`.
      - bash: |
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_pytorch" \
            --source_import="lightning.fabric,lightning.pytorch" \
            --target_import="lightning_fabric,pytorch_lightning"
          python .actions/assistant.py copy_replace_imports --source_dir="./examples/pl_hpu" \
            --source_import="lightning.fabric,lightning.pytorch" \
            --target_import="lightning_fabric,pytorch_lightning"
        displayName: 'Adjust tests & examples'

      - bash: |
          python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'Single card HPU test'

      - bash: |
          python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'Multi card(8) HPU test'

      # Each --hmp-* flag is kept on the same line as the ops list it takes.
      - bash: |
          python -m pytest -sv plugins/precision/hpu/test_hpu.py \
            --hmp-bf16 'plugins/precision/hpu/ops_bf16.txt' \
            --hmp-fp32 'plugins/precision/hpu/ops_fp32.txt' \
            --forked \
            --junitxml=hpu1_precision_test-results.xml
        workingDirectory: tests/tests_pytorch
        displayName: 'HPU precision test'

      # Append the examples directory to PYTHONPATH before running the sample.
      - bash: |
          export PYTHONPATH="${PYTHONPATH}:$(pwd)"
          python "pl_hpu/mnist_sample.py"
        workingDirectory: examples
        displayName: 'Testing: HPU examples'

      # Publish every hpu*_test-results.xml written by the pytest steps above,
      # whether the earlier steps succeeded or failed.
      - task: PublishTestResults@2
        inputs:
          testResultsFiles: 'tests/tests_pytorch/hpu*_test-results.xml'
          # NOTE(review): `python.version` is not defined anywhere in this
          # file -- confirm it is set elsewhere, otherwise the macro is left
          # unexpanded in the run title.
          testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
        condition: succeededOrFailed()
        displayName: 'Publish test results'