CI: validate JSON & fix benchmark (#8567)

* CI: validate JSON * as GHA * PT1.8 * 32g Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
2021-07-28 18:09:15 +02:00 · 2021-07-28 18:09:15 +02:00 · 470842f5c8
parent 0a71fe2859
commit 470842f5c8
5 changed files with 51 additions and 67 deletions
--- a/.azure-pipelines/gpu-benchmark.yml
+++ b/.azure-pipelines/gpu-benchmark.yml
@ -1,21 +1,19 @@
-name: GPU Parity testing
+schedules:
-
+  - cron: "0 0 * * *" # At the end of every day
-on:
+    displayName: Daily midnight benchmark
-  schedule:
+    branches:
-    - cron: "0 0 * * *" # At the end of every day
+      include:
        - "master"
 jobs:
-  parity-test:
+  - job: benchmarks
-    timeoutInMinutes: 120
+    timeoutInMinutes: "90"
-
+    cancelTimeoutInMinutes: "2"
    cancelTimeoutInMinutes: 2
    pool: gridai-spot-pool
    container:
      # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.8"
-
+      options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
    workspace:
      clean: all
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@ -9,19 +9,19 @@ trigger:
      - '*'
  branches:
    include:
-      - master
+      - "master"
-      - release/*
+      - "release/*"
-      - refs/tags/*
+      - "refs/tags/*"
 pr:
-  - master
+  - "master"
-  - release/*
+  - "release/*"
 jobs:
  - job: pytest
    # how long to run the job before automatically cancelling
-    timeoutInMinutes: 45
+    timeoutInMinutes: "45"
    # how much time to give 'run always even if cancelled tasks' before stopping them
-    cancelTimeoutInMinutes: 2
+    cancelTimeoutInMinutes: "2"
    pool: gridai-spot-pool
@ -92,14 +92,15 @@ jobs:
        testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
      condition: succeededOrFailed()
-    - task: PublishCodeCoverageResults@1
+    # todo: re-enable once this task passes the schema check; also, at the moment it does not seem to have any effect
-      displayName: 'Publish coverage report'
+    #- task: PublishCodeCoverageResults@2
-      inputs:
+    #  displayName: 'Publish coverage report'
-        codeCoverageTool: 'cobertura'
+    #  inputs:
-        summaryFileLocation: 'coverage.xml'
+    #    codeCoverageTool: 'Cobertura'
-        reportDirectory: '$(Build.SourcesDirectory)/htmlcov'
+    #    summaryFileLocation: 'coverage.xml'
-        testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
+    #    reportDirectory: '$(Build.SourcesDirectory)/htmlcov'
-      condition: succeededOrFailed()
+    #    testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
    #  condition: succeededOrFailed()
    - script: |
        set -e
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@ -123,21 +123,6 @@ jobs:
          push: false
        timeout-minutes: 50
  build-nvidia:
    runs-on: ubuntu-20.04
    # todo: temporarily skipped because the base container does not fit on the agent
    if: false
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Build NVIDIA Docker
        uses: docker/build-push-action@v2
        with:
          file: dockers/nvidia/Dockerfile
          push: false
        timeout-minutes: 50
  build-ipu:
    runs-on: ubuntu-20.04
    strategy:
--- a/.github/workflows/ci_schema.yml
+++ b/.github/workflows/ci_schema.yml
@ -0,0 +1,24 @@
 name: CI action schema
 # Validates the repository's CI configuration files against published JSON schemas.
 on: # Trigger on every push (no branch filter) and on pull requests targeting master or release/* branches
  push: {}
  pull_request:
    branches: [master, "release/*"]
 jobs:
  # Single job: install the validator, then check each family of CI configs.
  validate-schema:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      # The validator CLI is installed from PyPI without a version pin.
      - name: Install pkg
        run: |
          pip install check-jsonschema
      # Check every workflow file under .github/workflows against the SchemaStore GitHub workflow schema.
      - name: GH Workflows
        run: |
          check-jsonschema .github/workflows/*.yml --schemafile "https://json.schemastore.org/github-workflow"
      # Check every pipeline file under .azure-pipelines against the service schema
      # taken from the azure-pipelines-vscode repository at tag v1.188.1.
      - name: Azure Pipelines
        run: |
          check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json"
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@ -153,30 +153,6 @@ jobs:
          tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
        timeout-minutes: 55
  docker-NVIDIA:
    runs-on: ubuntu-20.04
    # todo: temporarily skipped because the base container does not fit on the agent
    if: false
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      # https://github.com/docker/setup-buildx-action
      # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command
      - uses: docker/setup-buildx-action@v1
      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - name: Publish NVIDIA to Docker Hub
        uses: docker/build-push-action@v2
        with:
          file: dockers/nvidia/Dockerfile
          tags: nvcr.io/pytorchlightning/pytorch_lightning:latest
        timeout-minutes: 55
  docker-IPU:
    runs-on: ubuntu-20.04
    strategy: