# Source: lightning/.github/workflows/tpu-tests.yml
# (file-viewer metadata at time of capture: 176 lines, 5.9 KiB, YAML)

# Workflow that runs the test suites on Google Cloud TPU VMs.
name: Test PyTorch - TPU

# Triggered by pushes to master/release branches and by pull_request_target
# events (PR opened, reopened, marked ready, labeled, or updated).
on:
  push:
    branches: [master, "release/*"]
  pull_request_target:
    branches: [master, "release/*"]
    types: [opened, reopened, ready_for_review, labeled, synchronize]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  # PRs reach this workflow through `pull_request_target`, never `pull_request`,
  # so match on the common prefix. Comparing against 'pull_request' alone was
  # always false and superseded PR runs were never cancelled.
  cancel-in-progress: ${{ startsWith(github.event_name, 'pull_request') }}

# Every `run:` step uses bash unless it sets its own `shell:`.
defaults:
  run:
    shell: bash
jobs:
  test-on-tpus:
    runs-on: ubuntu-22.04
    # Run for pushes to master, or for pull-request events on PRs that carry
    # the 'run TPU' label.
    if: >-
      (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
      (startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.labels.*.name, 'run TPU'))
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        pkg-name: ["fabric", "pytorch"]
        accelerator_type: ["v4-8"]
    env:
      XLA_VER: "2.0"
      # Without a pull_request payload these fall back to 'master' and the
      # triggering commit respectively.
      PR_NUMBER: ${{ github.event.pull_request.number || 'master' }}
      SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      CLOUDSDK_CORE_DISABLE_PROMPTS: 1  # default to --quiet
    steps:
- name: Set env
  run: |
    # Pick the gcloud compute zone for the accelerator generation and export it
    # to the following steps.
    # define --zone: https://cloud.google.com/tpu/docs/regions-zones
    case "${{ matrix.accelerator_type }}" in
      v4*) zone="us-central2-b" ;;
      *) zone="us-west4-a" ;;
    esac
    echo "CLOUDSDK_COMPUTE_ZONE=${zone}" >> "$GITHUB_ENV"
# Check out the PR head commit. On push events there is no pull_request
# payload, so this expression is presumably empty and checkout uses its
# default ref — confirm.
# NOTE(review): under pull_request_target this job checks out PR-authored code
# while also receiving repository secrets; confirm the 'run TPU' label gate is
# considered sufficient protection.
- uses: actions/checkout@v4
  with:
    ref: ${{ github.event.pull_request.head.sha }}
- uses: actions/setup-python@v5
  with:
    python-version: "3.10"
# Authenticate to Google Cloud, then install the gcloud CLI used by later steps.
- uses: google-github-actions/auth@v2
  with:
    credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}
- uses: google-github-actions/setup-gcloud@v2
- name: Time-based job cleanup
  if: always()
  run: |
    # Delete TPU VMs older than 35 minutes, except those whose name contains
    # "keepalive".
    gcloud compute tpus tpu-vm list --format='value(name,createTime)' > creation_times.txt
    cat creation_times.txt
    if [[ ! -s creation_times.txt ]]; then
      echo "No existing jobs"
      exit 0
    fi
    max_age=$((35 * 60))
    jobs_deleted=false
    while read -r vm_name vm_created; do
      # VMs explicitly marked as keepalive are never reaped
      case "$vm_name" in
        *keepalive*)
          echo "Skipping $vm_name, has keepalive in name"
          continue
          ;;
      esac
      # Age in seconds = now - creation time (both as Unix timestamps)
      created_ts=$(date -d "${vm_created}" +%s)
      now_ts=$(date +%s)
      age=$((now_ts - created_ts))
      if ((age <= max_age)); then
        echo "Skipping $vm_name, alive for $age seconds"
        continue
      fi
      # Past the timeout: request deletion without waiting for it to finish
      gcloud compute tpus tpu-vm delete "$vm_name" --async
      jobs_deleted=true
    done < creation_times.txt
    if [[ "$jobs_deleted" == true ]]; then
      sleep 5
      # diagnostics
      gcloud compute tpus tpu-vm list
    fi
- name: Update script
  shell: python
  run: |
    import os

    # Substitute the {PYTORCH_VERSION} placeholder in the package's TPU test
    # script with the XLA_VER environment value, rewriting the file in place.
    script_path = 'tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh'
    with open(script_path) as src:
        template = src.read()
    rendered = template.replace('{PYTORCH_VERSION}', os.environ["XLA_VER"])
    # Echo the final script into the job log
    print(rendered)
    with open(script_path, "w") as dst:
        dst.write(rendered)
- name: Create node
  id: tpu-create
  # TPU capacity is very limited, so success of this workflow is optional:
  # carry on normally when the VM cannot be created.
  continue-on-error: true
  env:
    JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
  run: |
    # Only v4 accelerators are provisioned; any other type makes this a no-op.
    if [[ "${{ matrix.accelerator_type }}" != v4* ]]; then
      exit 0
    fi
    gcloud compute tpus tpu-vm create "$JOB_NAME" \
      --accelerator-type="${{ matrix.accelerator_type }}" \
      --version="tpu-vm-v4-pt-$XLA_VER" \
      --preemptible
- name: Run tests
  # Skipped when the "Create node" step did not succeed (it may fail without
  # failing the job).
  if: steps.tpu-create.outcome == 'success'
  env:
    JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
  run: |
    set -uex
    # zip-copy-unzip the repository
    # NOTE(review): `-x .git/` has no wildcard, so it may exclude only the
    # directory entry itself; confirm whether `.git/*` was intended.
    zip -q -r repo.zip . -x .git/
    gcloud compute tpus tpu-vm scp --worker=all repo.zip "$JOB_NAME":~
    gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; unzip -q -o repo.zip"
    # run script
    # Record the status with `|| exit_code=$?`: under `set -e` a plain failing
    # command would abort the script right here, so the coverage file would
    # never be fetched and a following `exit_code=$?` could only ever see 0.
    exit_code=0
    gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; bash tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh" || exit_code=$?
    # pull out the coverage file
    gcloud compute tpus tpu-vm scp "$JOB_NAME":~/coverage.xml .
    # report the status of the test run itself
    exit "$exit_code"
- name: Cleanup job
  # Runs regardless of the outcome of earlier steps.
  if: always()
  env:
    JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
  run: |
    # Delete this job's TPU VM if it shows up in the VM listing.
    if gcloud compute tpus tpu-vm list | grep -q "$JOB_NAME"; then
      # diagnostics
      gcloud compute tpus tpu-vm describe "$JOB_NAME"
      # request deletion without waiting for completion
      gcloud compute tpus tpu-vm delete "$JOB_NAME" --async
      sleep 5
      # diagnostics
      gcloud compute tpus tpu-vm list
    else
      echo "$JOB_NAME wasn't created"
    fi
- name: Upload coverage to Codecov
  uses: codecov/codecov-action@v4
  # Upload is best-effort: neither a step error nor a Codecov error fails CI.
  continue-on-error: true
  with:
    name: TPU-coverage
    file: coverage.xml
    flags: tpu,pytest
    token: ${{ secrets.CODECOV_TOKEN }}
    fail_ci_if_error: false