# Runs the Fabric and PyTorch test suites on Google Cloud TPU VMs.
#
# Flow per matrix entry: authenticate to GCP -> garbage-collect stale TPU VMs ->
# patch the test script with the XLA version -> create a preemptible TPU VM ->
# copy the repository over and run the tests -> delete the VM -> upload coverage.
name: Test PyTorch - TPU

on:
  push:
    branches: [master, "release/*"]
  pull_request_target:
    branches: [master, "release/*"]
    types: [opened, reopened, ready_for_review, labeled, synchronize]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  # PRs reach this workflow as `pull_request_target`, never as `pull_request`, so match on the
  # prefix (same predicate as the job-level `if`); pushes are never cancelled.
  cancel-in-progress: ${{ startsWith(github.event_name, 'pull_request') }}

defaults:
  run:
    # NOTE: an explicit `shell: bash` makes GitHub run scripts with `bash -eo pipefail`
    shell: bash

jobs:
  test-on-tpus:
    runs-on: ubuntu-22.04
    # run only when the PR carries the 'run TPU' label or on a push to master
    if: |
      (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
      (startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.labels.*.name, 'run TPU'))
    strategy:
      fail-fast: false
      matrix:
        pkg-name: ["fabric", "pytorch"]
        accelerator_type: ["v4-8"]
    timeout-minutes: 30
    env:
      XLA_VER: "2.0"
      # PR number and head commit for PR events; 'master' and the pushed commit otherwise.
      # Both feed into the TPU VM name (JOB_NAME) used by the steps below.
      PR_NUMBER: ${{ github.event.pull_request.number || 'master' }}
      SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      CLOUDSDK_CORE_DISABLE_PROMPTS: "1" # default to --quiet
    steps:
      - name: Set env
        run: |
          # define --zone: https://cloud.google.com/tpu/docs/regions-zones
          # exported through GITHUB_ENV so that every later gcloud call picks it up
          if [[ "${{ matrix.accelerator_type }}" == v4* ]]; then
            echo "CLOUDSDK_COMPUTE_ZONE=us-central2-b" >> $GITHUB_ENV
          else
            echo "CLOUDSDK_COMPUTE_ZONE=us-west4-a" >> $GITHUB_ENV
          fi

      # NOTE(review): on `pull_request_target` this checks out the PR's head commit while the
      # job has access to repository secrets; the 'run TPU' label in the job-level `if` is the
      # only gate. On `push` the expression is empty and the action's default ref is used.
      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}
      - uses: "google-github-actions/setup-gcloud@v2"

      # Garbage-collects TPU VMs left behind by earlier runs: anything older than 35 minutes
      # (just above this job's 30-minute timeout) is deleted unless its name opts out.
      - name: Time-based job cleanup
        if: always()
        run: |
          gcloud compute tpus tpu-vm list --format='value(name,createTime)' > creation_times.txt
          cat creation_times.txt

          if [ ! -s "creation_times.txt" ]; then
            echo "No existing jobs"
            exit 0
          fi

          jobs_deleted=false
          while read -r job_name created_at; do
            # Skip jobs with "keepalive" in the name
            if [[ "$job_name" == *"keepalive"* ]]; then
              echo "Skipping $job_name, has keepalive in name"
              continue
            fi

            # Convert the creation time to Unix timestamp
            created_timestamp=$(date -d "${created_at}" +%s)

            # Calculate the difference between the current time and the creation time
            current_timestamp=$(date +%s)
            age=$((current_timestamp - created_timestamp))

            # Check if the age has surpassed a timeout
            if ((age > 35 * 60)); then
              # delete the job
              gcloud compute tpus tpu-vm delete "$job_name" --async
              jobs_deleted=true
            else
              echo "Skipping $job_name, alive for $age seconds"
            fi
          done < creation_times.txt

          if [ "$jobs_deleted" = true ]; then
            sleep 5
            # diagnostics
            gcloud compute tpus tpu-vm list
          fi

      # Substitutes the `{PYTORCH_VERSION}` placeholder in the package's TPU test script with
      # XLA_VER, in place, and prints the result to the log.
      - name: Update script
        run: |
          import os
          fname = 'tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh'
          with open(fname) as fopen:
              data = fopen.read()
          data = data.replace('{PYTORCH_VERSION}', os.environ["XLA_VER"])
          print(data)
          with open(fname, "w") as fopen:
              fopen.write(data)
        shell: python

      - name: Create node
        id: tpu-create
        # TPU capacity is very limited so this workflow's success is optional. continue normally if creation fails
        continue-on-error: true
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          # only v4 accelerator types are handled here; any other type creates nothing
          if [[ "${{ matrix.accelerator_type }}" == v4* ]]; then
            gcloud compute tpus tpu-vm create "$JOB_NAME" \
              --accelerator-type=${{ matrix.accelerator_type }} \
              --version="tpu-vm-v4-pt-$XLA_VER" \
              --preemptible
          fi

      - name: Run tests
        # `outcome` (not `conclusion`) reflects the real result despite continue-on-error
        if: steps.tpu-create.outcome == 'success'
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          set -uex

          # zip-copy-unzip the repository
          zip -q -r repo.zip . -x .git/
          gcloud compute tpus tpu-vm scp --worker=all repo.zip "$JOB_NAME":~
          gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; unzip -q -o repo.zip"

          # run script
          # errexit is active, so a bare command followed by `exit_code=$?` would abort right
          # here on a test failure and the coverage file would never be fetched; capture the
          # status with `||` instead and re-raise it at the end
          exit_code=0
          gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; bash tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh" || exit_code=$?

          # pull out the coverage file
          gcloud compute tpus tpu-vm scp "$JOB_NAME":~/coverage.xml .

          exit $exit_code

      - name: Cleanup job
        if: always()
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          # `describe` is both the existence check and the diagnostics output. It matches the
          # exact node name and avoids `list | grep -q`, which under `pipefail` can report a
          # failure (and thereby skip the deletion) when grep closes the pipe early.
          if ! gcloud compute tpus tpu-vm describe "$JOB_NAME"; then
            echo "$JOB_NAME wasn't created"
            exit 0
          fi

          # delete the job
          gcloud compute tpus tpu-vm delete "$JOB_NAME" --async
          sleep 5

          # diagnostics
          gcloud compute tpus tpu-vm list

      # Best effort: coverage.xml only exists when the "Run tests" step got far enough to copy
      # it back, so neither this step nor the upload itself may fail the job.
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: tpu,pytest
          name: TPU-coverage
          fail_ci_if_error: false