From d9aa833628d04d7309f5843b0b10add94932cb9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Tue, 7 Nov 2023 04:13:20 +0100
Subject: [PATCH] Add more CUDA card FLOPs (#18958)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
Co-authored-by: awaelchli
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 src/lightning/fabric/utilities/throughput.py  | 250 +++++++++++++++---
 .../tests_fabric/utilities/test_throughput.py |  38 ++-
 2 files changed, 247 insertions(+), 41 deletions(-)

diff --git a/src/lightning/fabric/utilities/throughput.py b/src/lightning/fabric/utilities/throughput.py
index 44841c0039..cd81f53153 100644
--- a/src/lightning/fabric/utilities/throughput.py
+++ b/src/lightning/fabric/utilities/throughput.py
@@ -305,44 +305,204 @@ def measure_flops(
 
 
 _CUDA_FLOPS: Dict[str, Dict[Union[str, torch.dtype], float]] = {
-    # source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet
-    # nvidia publishes spec sheet with a 2x sparsity factor
-    "h100-sxm": {
+    # Hopper
+    # source: https://resources.nvidia.com/en-us-tensor-core
+    "h100 nvl": {
         torch.float64: 67e12,
-        torch.float32: 67e12,
-        "tfloat32": 989e12 / 2,
-        torch.bfloat16: 1.979e15 / 2,
-        torch.float16: 1.979e15 / 2,
-        torch.int8: 3.958e15 / 2,
+        torch.float32: 133.8e12,
+        "tfloat32": 989.4e12,
+        torch.bfloat16: 1978.8e12,
+        torch.float16: 1978.8e12,
+        torch.int8: 3957.8e12,
     },
-    "h100-pcie": {
-        torch.float64: 51e12,
-        torch.float32: 51e12,
-        "tfloat32": 756e12 / 2,
-        torch.bfloat16: 1.513e15 / 2,
-        torch.float16: 1.513e15 / 2,
-        torch.int8: 3.026e15 / 2,
+    "h100 sxm": {
+        torch.float64: 33.5e12,
+        torch.float32: 66.9e12,
+        "tfloat32": 494.7e12,
+        torch.bfloat16: 989.4e12,
+        torch.float16: 989.4e12,
+        torch.int8: 1978.9e12,
     },
+    "h100 pcie": {
+        torch.float64: 25.6e12,
+        torch.float32: 51.2e12,
+        "tfloat32": 378e12,
+        torch.bfloat16: 756e12,
+        torch.float16: 756e12,
+        torch.int8: 1513e12,
+    },
+    # Ada
+    # source: https://images.nvidia.com/aem-dam/Solutions/Data-Center/l4/nvidia-ada-gpu-architecture-whitepaper-v2.1.pdf
+    "rtx 4090": {
+        torch.float32: 82.6e12,
+        "tfloat32": 82.6e12,
+        torch.bfloat16: 82.6e12,
+        torch.float16: 82.6e12,
+        torch.int8: 660.6e12,
+        "int4": 1321.2e12,
+    },
+    "rtx 4080": {
+        torch.float32: 48.7e12,
+        "tfloat32": 48.7e12,
+        torch.bfloat16: 48.7e12,
+        torch.float16: 48.7e12,
+        torch.int8: 389.9e12,
+        "int4": 779.8e12,
+    },
+    "l4": {
+        torch.float32: 30.3e12,
+        "tfloat32": 60e12,
+        torch.bfloat16: 121e12,
+        torch.float16: 121e12,
+        torch.int8: 242e12,
+        "int4": 484e12,
+    },
+    "l40": {
+        torch.float32: 90.5e12,
+        "tfloat32": 90.5e12,
+        torch.bfloat16: 181e12,
+        torch.float16: 181e12,
+        torch.int8: 362e12,
+        "int4": 724e12,
+    },
+    # Ampere
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
     # sxm and pcie have same flop counts
     "a100": {
-        torch.float64: 19.5e12,
+        torch.float64: 9.7e12,
         torch.float32: 19.5e12,
         "tfloat32": 156e12,
         torch.bfloat16: 312e12,
         torch.float16: 312e12,
+        torch.int8: 624e12,
+    },
+    "a6000": {
+        torch.float32: 38.7e12,
+        "tfloat32": 77.4e12,
+        torch.bfloat16: 38.7e12,
+        torch.float16: 38.7e12,
+        torch.int8: 309.7e12,
+        "int4": 619.3e12,
+    },
+    "a40": {
+        torch.float32: 37.4e12,
+        "tfloat32": 74.8e12,
+        torch.bfloat16: 37.4e12,
+        torch.float16: 37.4e12,
+        torch.int8: 299.3e12,
+        "int4": 598.7e12,
     },
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf
-    "a10g": {torch.float32: 31.2e12, "tfloat32": 62.5e12, torch.bfloat16: 125e12, torch.float16: 125e12},
-    # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf
-    "v100-sxm": {torch.float64: 7.8e12, torch.float32: 15.7e12, torch.float16: 125e12},
-    "v100-pcie": {torch.float64: 7e12, torch.float32: 14e12, torch.float16: 112e12},
-    "v100s-pcie": {torch.float64: 8.2e12, torch.float32: 16.4e12, torch.float16: 130e12},
+    "a10g": {
+        torch.float32: 31.2e12,
+        "tfloat32": 62.5e12,
+        torch.bfloat16: 125e12,
+        torch.float16: 125e12,
+        torch.int8: 250e12,
+        "int4": 500e12,
+    },
+    "rtx 3090 ti": {
+        torch.float32: 40e12,
+        "tfloat32": 40e12,
+        torch.bfloat16: 40e12,
+        torch.float16: 40e12,
+        torch.int8: 320e12,
+        "int4": 640e12,
+    },
+    "rtx 3090": {
+        torch.float32: 35.6e12,
+        "tfloat32": 35.6e12,
+        torch.bfloat16: 35.6e12,
+        torch.float16: 35.6e12,
+        torch.int8: 284e12,
+        "int4": 568e12,
+    },
+    "rtx 3080 ti": {
+        torch.float32: 34.1e12,
+        "tfloat32": 34.1e12,
+        torch.bfloat16: 34.1e12,
+        torch.float16: 34.1e12,
+        torch.int8: 272.8e12,
+        "int4": 546.6e12,
+    },
+    "rtx 3080": {
+        torch.float32: 29.8e12,
+        "tfloat32": 29.8e12,
+        torch.bfloat16: 29.8e12,
+        torch.float16: 29.8e12,
+        torch.int8: 238e12,
+        "int4": 476e12,
+    },
+    "rtx 3070": {
+        torch.float32: 20.3e12,
+        "tfloat32": 20.3e12,
+        torch.bfloat16: 20.3e12,
+        torch.float16: 20.3e12,
+        torch.int8: 162.6e12,
+        "int4": 325.2e12,
+    },
+    # Turing
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf
     # sxm and pcie have same flop counts
-    "t4": {torch.float32: 8.1e12, torch.float16: 65e12, torch.int8: 130e12},
+    "t4": {
+        torch.float32: 8.1e12,
+        torch.float16: 65e12,
+        torch.int8: 130e12,
+        "int4": 260e12,
+    },
     # https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf
-    "quadro rtx 5000": {torch.float32: 11.2e12, torch.float16: 89.2e12},
+    "quadro rtx 5000": {
+        torch.float32: 11.2e12,
+        torch.float16: 89.2e12,
+    },
+    "rtx 2080 super": {
+        torch.float32: 11.2e12,
+        torch.float16: 22.3e12,
+        torch.int8: 178.4e12,
+        "int4": 356.8e12,
+    },
+    "rtx 2080 ti": {
+        torch.float32: 14.2e12,
+        torch.float16: 28.5e12,
+        torch.int8: 227.7e12,
+        "int4": 455.4e12,
+    },
+    "rtx 2080": {
+        torch.float32: 10.6e12,
+        torch.float16: 21.2e12,
+        torch.int8: 169.6e12,
+        "int4": 339.1e12,
+    },
+    # https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
+    "rtx 2070 super": {
+        torch.float32: 9.1e12,
+        torch.float16: 18.1e12,
+        torch.int8: 145e12,
+        "int4": 290e12,
+    },
+    "titan rtx": {
+        torch.float32: 16.3e12,
+        torch.float16: 32.6e12,
+        torch.int8: 261e12,
+        "int4": 522e12,
+    },
+    # Volta
+    # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf
+    "v100 sxm": {
+        torch.float64: 7.8e12,
+        torch.float32: 15.7e12,
+        torch.float16: 125e12,
+    },
+    "v100 pcie": {
+        torch.float64: 7e12,
+        torch.float32: 14e12,
+        torch.float16: 112e12,
+    },
+    "v100s pcie": {
+        torch.float64: 8.2e12,
+        torch.float32: 16.4e12,
+        torch.float16: 130e12,
+    },
 }
 
 _TPU_FLOPS = {
@@ -370,31 +530,51 @@ def get_available_flops(device: torch.device, dtype: Union[torch.dtype, str]) ->
     if device.type == "cuda":
         device_name = torch.cuda.get_device_name(device)
         chip = device_name.lower()
-        if "h100" in chip and "hbm3" in chip:
-            chip = "h100-sxm"
-        elif "h100" in chip and ("pcie" in chip or "hbm2e" in chip):
-            chip = "h100-pcie"
+        if "h100" in chip:
+            if "hbm3" in chip:
+                chip = "h100 sxm"
+            elif "nvl" in chip:
+                chip = "h100 nvl"
+            elif "pcie" in chip or "hbm2e" in chip:
+                chip = "h100 pcie"
+        elif "l4" in chip:
+            chip = "l40" if "tesla" in chip else "l4"
+        elif "geforce rtx" in chip:
+            number = chip.split(" ")[3]
+            extra = ""
+            if "super" in chip:
+                extra = " super"
+            elif "ti" in chip:
+                extra = " ti"
+            chip = f"rtx {number}{extra}"
+        elif "a6000" in chip:
+            chip = "a6000"
         elif "a100" in chip:
             chip = "a100"
+        elif "a40" in chip:
+            chip = "a40"
         elif "a10g" in chip:
             chip = "a10g"
-        elif "v100-sxm" in chip:
-            chip = "v100-sxm"
-        elif "v100-pcie" in chip:
-            chip = "v100-pcie"
-        elif "v100s-pcie" in chip:
-            chip = "v100s-pcie"
         elif "t4" in chip:
             chip = "t4"
         elif "quadro rtx 5000" in chip:
             chip = "quadro rtx 5000"
+        elif "titan rtx" in chip:
+            chip = "titan rtx"
+        elif "v100-sxm" in chip:
+            chip = "v100 sxm"
+        elif "v100-pcie" in chip:
+            chip = "v100 pcie"
+        elif "v100s-pcie" in chip:
+            chip = "v100s pcie"
         else:
             # the flops list is not exhaustive, return with a warning
             rank_zero_warn(f"FLOPs not found for {device_name!r}")
             return None
         if chip not in _CUDA_FLOPS:
-            # if we were able to parse the chip, it should be in the flops list
-            raise RuntimeError(f"FLOPs not found for {device_name!r}, chip is {chip!r}")
+            # parsing is implemented but we don't have the stats
+            rank_zero_warn(f"FLOPs not found for {device_name!r}, chip is {chip!r}")
+            return None
         dtype_to_flops = _CUDA_FLOPS[chip]
         if dtype is torch.float32:
             from lightning.fabric.accelerators.cuda import _is_ampere_or_later
diff --git a/tests/tests_fabric/utilities/test_throughput.py b/tests/tests_fabric/utilities/test_throughput.py
index 06e63706d5..55d35e8707 100644
--- a/tests/tests_fabric/utilities/test_throughput.py
+++ b/tests/tests_fabric/utilities/test_throughput.py
@@ -36,7 +36,7 @@ def test_measure_flops():
 def test_get_available_flops(xla_available):
     with mock.patch("torch.cuda.get_device_name", return_value="NVIDIA H100 PCIe"):
         flops = get_available_flops(torch.device("cuda"), torch.bfloat16)
-    assert flops == 1.513e15 / 2
+    assert flops == 756e12
 
     with pytest.warns(match="not found for 'CocoNut"), mock.patch("torch.cuda.get_device_name", return_value="CocoNut"):
         assert get_available_flops(torch.device("cuda"), torch.bfloat16) is None
@@ -64,18 +64,44 @@ def test_get_available_flops(xla_available):
 @pytest.mark.parametrize(
     "device_name",
     [
-        # TODO: We need to represent the real names here
-        "h100-hbm3",
+        # Hopper
+        "h100-nvl",  # TODO: switch with `torch.cuda.get_device_name()` result
+        "h100-hbm3",  # TODO: switch with `torch.cuda.get_device_name()` result
         "NVIDIA H100 PCIe",
-        "h100-hbm2e",
+        "h100-hbm2e",  # TODO: switch with `torch.cuda.get_device_name()` result
+        # Ada
+        "NVIDIA GeForce RTX 4090",
+        "NVIDIA GeForce RTX 4080",
+        "Tesla L40",
+        "NVIDIA L4",
+        # Ampere
         "NVIDIA A100 80GB PCIe",
         "NVIDIA A100-SXM4-40GB",
+        "NVIDIA GeForce RTX 3090",
+        "NVIDIA GeForce RTX 3090 Ti",
+        "NVIDIA GeForce RTX 3080",
+        "NVIDIA GeForce RTX 3080 Ti",
+        "NVIDIA GeForce RTX 3070",
+        pytest.param("NVIDIA GeForce RTX 3070 Ti", marks=pytest.mark.xfail(raises=AssertionError)),
+        pytest.param("NVIDIA GeForce RTX 3060", marks=pytest.mark.xfail(raises=AssertionError)),
+        pytest.param("NVIDIA GeForce RTX 3060 Ti", marks=pytest.mark.xfail(raises=AssertionError)),
+        pytest.param("NVIDIA GeForce RTX 3050", marks=pytest.mark.xfail(raises=AssertionError)),
+        pytest.param("NVIDIA GeForce RTX 3050 Ti", marks=pytest.mark.xfail(raises=AssertionError)),
pytest.param("NVIDIA GeForce RTX 3050 Ti", marks=pytest.mark.xfail(raises=AssertionError)), + "NVIDIA A6000", + "NVIDIA A40", "NVIDIA A10G", + # Turing + "NVIDIA GeForce RTX 2080 SUPER", + "NVIDIA GeForce RTX 2080 Ti", + "NVIDIA GeForce RTX 2080", + "NVIDIA GeForce RTX 2070 Super", + "Quadro RTX 5000 with Max-Q Design", + "Tesla T4", + "TITAN RTX", + # Volta "Tesla V100-SXm2-32GB", "Tesla V100-PCIE-32GB", "Tesla V100S-PCIE-32GB", - "Tesla T4", - "Quadro RTX 5000 with Max-Q Design", ], ) @mock.patch("lightning.fabric.accelerators.cuda._is_ampere_or_later", return_value=False)