diff --git a/src/lightning/fabric/utilities/throughput.py b/src/lightning/fabric/utilities/throughput.py index 44841c0039..cd81f53153 100644 --- a/src/lightning/fabric/utilities/throughput.py +++ b/src/lightning/fabric/utilities/throughput.py @@ -305,44 +305,204 @@ def measure_flops( _CUDA_FLOPS: Dict[str, Dict[Union[str, torch.dtype], float]] = { - # source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet - # nvidia publishes spec sheet with a 2x sparsity factor - "h100-sxm": { + # Hopper + # source: https://resources.nvidia.com/en-us-tensor-core + "h100 nvl": { torch.float64: 67e12, - torch.float32: 67e12, - "tfloat32": 989e12 / 2, - torch.bfloat16: 1.979e15 / 2, - torch.float16: 1.979e15 / 2, - torch.int8: 3.958e15 / 2, + torch.float32: 133.8e12, + "tfloat32": 989.4e12, + torch.bfloat16: 1978.8e12, + torch.float16: 1978.8e12, + torch.int8: 3957.8e12, }, - "h100-pcie": { - torch.float64: 51e12, - torch.float32: 51e12, - "tfloat32": 756e12 / 2, - torch.bfloat16: 1.513e15 / 2, - torch.float16: 1.513e15 / 2, - torch.int8: 3.026e15 / 2, + "h100 sxm": { + torch.float64: 33.5e12, + torch.float32: 66.9e12, + "tfloat32": 494.7e12, + torch.bfloat16: 989.4e12, + torch.float16: 989.4e12, + torch.int8: 1978.9e12, }, + "h100 pcie": { + torch.float64: 25.6e12, + torch.float32: 51.2e12, + "tfloat32": 378e12, + torch.bfloat16: 756e12, + torch.float16: 756e12, + torch.int8: 1513e12, + }, + # Ada + # source: https://images.nvidia.com/aem-dam/Solutions/Data-Center/l4/nvidia-ada-gpu-architecture-whitepaper-v2.1.pdf + "rtx 4090": { + torch.float32: 82.6e12, + "tfloat32": 82.6e12, + torch.bfloat16: 82.6e12, + torch.float16: 82.6e12, + torch.int8: 660.6e12, + "int4": 1321.2e12, + }, + "rtx 4080": { + torch.float32: 48.7e12, + "tfloat32": 48.7e12, + torch.bfloat16: 48.7e12, + torch.float16: 48.7e12, + torch.int8: 389.9e12, + "int4": 779.8e12, + }, + "l4": { + torch.float32: 30.3e12, + "tfloat32": 60e12, + torch.bfloat16: 121e12, + torch.float16: 121e12, + torch.int8: 242e12, + "int4": 484e12, + }, + "l40": { + torch.float32: 90.5e12, + "tfloat32": 90.5e12, + torch.bfloat16: 181e12, + torch.float16: 181e12, + torch.int8: 362e12, + "int4": 724e12, + }, + # Ampere # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf # sxm and pcie have same flop counts "a100": { - torch.float64: 19.5e12, + torch.float64: 9.7e12, torch.float32: 19.5e12, "tfloat32": 156e12, torch.bfloat16: 312e12, torch.float16: 312e12, + torch.int8: 624e12, + }, + "a6000": { + torch.float32: 38.7e12, + "tfloat32": 77.4e12, + torch.bfloat16: 38.7e12, + torch.float16: 38.7e12, + torch.int8: 309.7e12, + "int4": 619.3e12, + }, + "a40": { + torch.float32: 37.4e12, + "tfloat32": 74.8e12, + torch.bfloat16: 37.4e12, + torch.float16: 37.4e12, + torch.int8: 299.3e12, + "int4": 598.7e12, }, # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf - "a10g": {torch.float32: 31.2e12, "tfloat32": 62.5e12, torch.bfloat16: 125e12, torch.float16: 125e12}, - # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf - "v100-sxm": {torch.float64: 7.8e12, torch.float32: 15.7e12, torch.float16: 125e12}, - "v100-pcie": {torch.float64: 7e12, torch.float32: 14e12, torch.float16: 112e12}, - "v100s-pcie": {torch.float64: 8.2e12, torch.float32: 16.4e12, torch.float16: 130e12}, + "a10g": { + torch.float32: 31.2e12, + "tfloat32": 62.5e12, + torch.bfloat16: 125e12, + torch.float16: 125e12, + torch.int8: 250e12, + "int4": 500e12, + }, + "rtx 3090 ti": { + torch.float32: 40e12, + "tfloat32": 40e12, + torch.bfloat16: 40e12, + torch.float16: 40e12, + torch.int8: 320e12, + "int4": 640e12, + }, + "rtx 3090": { + torch.float32: 35.6e12, + "tfloat32": 35.6e12, + torch.bfloat16: 35.6e12, + torch.float16: 35.6e12, + torch.int8: 284e12, + "int4": 568e12, + }, + "rtx 3080 ti": { + torch.float32: 34.1e12, + "tfloat32": 34.1e12, + torch.bfloat16: 34.1e12, + torch.float16: 34.1e12, + torch.int8: 272.8e12, + "int4": 546.6e12, + }, + "rtx 3080": { + torch.float32: 29.8e12, + "tfloat32": 29.8e12, + torch.bfloat16: 29.8e12, + torch.float16: 29.8e12, + torch.int8: 238e12, + "int4": 476e12, + }, + "rtx 3070": { + torch.float32: 20.3e12, + "tfloat32": 20.3e12, + torch.bfloat16: 20.3e12, + torch.float16: 20.3e12, + torch.int8: 162.6e12, + "int4": 325.2e12, + }, + # Turing # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf # sxm and pcie have same flop counts - "t4": {torch.float32: 8.1e12, torch.float16: 65e12, torch.int8: 130e12}, + "t4": { + torch.float32: 8.1e12, + torch.float16: 65e12, + torch.int8: 130e12, + "int4": 260e12, + }, # https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf - "quadro rtx 5000": {torch.float32: 11.2e12, torch.float16: 89.2e12}, + "quadro rtx 5000": { + torch.float32: 11.2e12, + torch.float16: 89.2e12, + }, + "rtx 2080 super": { + torch.float32: 11.2e12, + torch.float16: 22.3e12, + torch.int8: 178.4e12, + "int4": 356.8e12, + }, + "rtx 2080 ti": { + torch.float32: 14.2e12, + torch.float16: 28.5e12, + torch.int8: 227.7e12, + "int4": 455.4e12, + }, + "rtx 2080": { + torch.float32: 10.6e12, + torch.float16: 21.2e12, + torch.int8: 169.6e12, + "int4": 339.1e12, + }, + # https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf + "rtx 2070 super": { + torch.float32: 9.1e12, + torch.float16: 18.1e12, + torch.int8: 145e12, + "int4": 290e12, + }, + "titan rtx": { + torch.float32: 16.3e12, + torch.float16: 32.6e12, + torch.int8: 261e12, + "int4": 522e12, + }, + # Volta + # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf + "v100 sxm": { + torch.float64: 7.8e12, + torch.float32: 15.7e12, + torch.float16: 125e12, + }, + "v100 pcie": { + torch.float64: 7e12, + torch.float32: 14e12, + torch.float16: 112e12, + }, + "v100s pcie": { + torch.float64: 8.2e12, + torch.float32: 16.4e12, + torch.float16: 130e12, + }, } _TPU_FLOPS = { @@ -370,31 +530,51 @@ def get_available_flops(device: torch.device, dtype: Union[torch.dtype, str]) -> if device.type == "cuda": device_name = torch.cuda.get_device_name(device) chip = device_name.lower() - if "h100" in chip and "hbm3" in chip: - chip = "h100-sxm" - elif "h100" in chip and ("pcie" in chip or "hbm2e" in chip): - chip = "h100-pcie" + if "h100" in chip: + if "hbm3" in chip: + chip = "h100 sxm" + elif "nvl" in chip: + chip = "h100 nvl" + elif "pcie" in chip or "hbm2e" in chip: + chip = "h100 pcie" + elif "l4" in chip: + chip = "l40" if "tesla" in chip else "l4" + elif "geforce rtx" in chip: + number = chip.split(" ")[3] + extra = "" + if "super" in chip: + extra = " super" + elif "ti" in chip: + extra = " ti" + chip = f"rtx {number}{extra}" + elif "a6000" in chip: + chip = "a6000" elif "a100" in chip: chip = "a100" + elif "a40" in chip: + chip = "a40" elif "a10g" in chip: chip = "a10g" - elif "v100-sxm" in chip: - chip = "v100-sxm" - elif "v100-pcie" in chip: - chip = "v100-pcie" - elif "v100s-pcie" in chip: - chip = "v100s-pcie" elif "t4" in chip: chip = "t4" elif "quadro rtx 5000" in chip: chip = "quadro rtx 5000" + elif "titan rtx" in chip: + chip = "titan rtx" + elif "v100-sxm" in chip: + chip = "v100 sxm" + elif "v100-pcie" in chip: + chip = "v100 pcie" + elif "v100s-pcie" in chip: + chip = "v100s pcie" else: # the flops list is not exhaustive, return with a warning rank_zero_warn(f"FLOPs not found for {device_name!r}") return None if chip not in _CUDA_FLOPS: - # if we were able to parse the chip, it should be in the flops list - raise RuntimeError(f"FLOPs not found for {device_name!r}, chip is {chip!r}") + # parsing is implemented but we don't have the stats + rank_zero_warn(f"FLOPs not found for {device_name!r}, chip is {chip!r}") + return None dtype_to_flops = _CUDA_FLOPS[chip] if dtype is torch.float32: from lightning.fabric.accelerators.cuda import _is_ampere_or_later diff --git a/tests/tests_fabric/utilities/test_throughput.py b/tests/tests_fabric/utilities/test_throughput.py index 06e63706d5..55d35e8707 100644 --- a/tests/tests_fabric/utilities/test_throughput.py +++ b/tests/tests_fabric/utilities/test_throughput.py @@ -36,7 +36,7 @@ def test_measure_flops(): def test_get_available_flops(xla_available): with mock.patch("torch.cuda.get_device_name", return_value="NVIDIA H100 PCIe"): flops = get_available_flops(torch.device("cuda"), torch.bfloat16) - assert flops == 1.513e15 / 2 + assert flops == 756e12 with pytest.warns(match="not found for 'CocoNut"), mock.patch("torch.cuda.get_device_name", return_value="CocoNut"): assert get_available_flops(torch.device("cuda"), torch.bfloat16) is None @@ -64,18 +64,44 @@ def test_get_available_flops(xla_available): @pytest.mark.parametrize( "device_name", [ - # TODO: We need to represent the real names here - "h100-hbm3", + # Hopper + "h100-nvl", # TODO: switch with `torch.cuda.get_device_name()` result + "h100-hbm3", # TODO: switch with `torch.cuda.get_device_name()` result "NVIDIA H100 PCIe", - "h100-hbm2e", + "h100-hbm2e", # TODO: switch with `torch.cuda.get_device_name()` result + # Ada + "NVIDIA GeForce RTX 4090", + "NVIDIA GeForce RTX 4080", + "Tesla L40", + "NVIDIA L4", + # Ampere "NVIDIA A100 80GB PCIe", "NVIDIA A100-SXM4-40GB", + "NVIDIA GeForce RTX 3090", + "NVIDIA GeForce RTX 3090 Ti", + "NVIDIA GeForce RTX 3080", + "NVIDIA GeForce RTX 3080 Ti", + "NVIDIA GeForce RTX 3070", + pytest.param("NVIDIA GeForce RTX 3070 Ti", marks=pytest.mark.xfail(raises=AssertionError)), + pytest.param("NVIDIA GeForce RTX 3060", marks=pytest.mark.xfail(raises=AssertionError)), + pytest.param("NVIDIA GeForce RTX 3060 Ti", marks=pytest.mark.xfail(raises=AssertionError)), + pytest.param("NVIDIA GeForce RTX 3050", marks=pytest.mark.xfail(raises=AssertionError)), + pytest.param("NVIDIA GeForce RTX 3050 Ti", marks=pytest.mark.xfail(raises=AssertionError)), + "NVIDIA A6000", + "NVIDIA A40", "NVIDIA A10G", + # Turing + "NVIDIA GeForce RTX 2080 SUPER", + "NVIDIA GeForce RTX 2080 Ti", + "NVIDIA GeForce RTX 2080", + "NVIDIA GeForce RTX 2070 Super", + "Quadro RTX 5000 with Max-Q Design", + "Tesla T4", + "TITAN RTX", + # Volta "Tesla V100-SXm2-32GB", "Tesla V100-PCIE-32GB", "Tesla V100S-PCIE-32GB", - "Tesla T4", - "Quadro RTX 5000 with Max-Q Design", ], ) @mock.patch("lightning.fabric.accelerators.cuda._is_ampere_or_later", return_value=False)