# =============================================================================
""" Default quantization backend for quantizing weights and activations """
import functools
-from typing import Callable, Optional, List
+from packaging import version
+from typing import Callable, Optional, List, Tuple
import torch
from aimet_torch.v2.utils import _is_expandable, _ContextManager
import aimet_torch.v2.experimental.onnx._export as _onnx
-from packaging import version


-if version.parse(torch.__version__) >= version.parse("2.0.0"):
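+# Cache the installed torch version as a (major, minor, micro) tuple so that
+# the version checks below reduce to simple tuple comparisons.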
+_torch_version: Tuple[int, int, int] = (version.parse(torch.__version__).major,
+                                        version.parse(torch.__version__).minor,
+                                        version.parse(torch.__version__).micro)
+
+if _torch_version >= (2, 0, 0):
    _compile = torch.compile
else:
    _compile = lambda fn: fn
@@ -155,6 +159,8 @@ def quantize(tensor: torch.Tensor, scale: torch.Tensor, offset: torch.Tensor,



+_ALLOW_FAST_FORWARD = True   # temporary flag for debugging
+
@_onnx.register_symbolic(_onnx.quantize_dequantize_symbolic)
def quantize_dequantize(tensor: torch.Tensor, scale: torch.Tensor, offset: torch.Tensor,
                        qmin: int, qmax: int, block_size: Optional[List] = None) -> torch.Tensor:
@@ -170,6 +176,26 @@ def quantize_dequantize(tensor: torch.Tensor, scale: torch.Tensor, offset: torch
170176 """
171177 _validate_arguments (tensor , scale , qmin , qmax , block_size )
172178
179+ _fast_forward = _ALLOW_FAST_FORWARD
180+
181+ # torch.fake_quantize doesn't support blockwise quantization
182+ _fast_forward &= block_size is None
183+
184+ # torch.fake_quantize doesn't support JIT tracing
185+ _fast_forward &= not torch .jit .is_tracing ()
186+
187+ # torch.fake_quantize doesn't compute gradients for scale/offset
188+ _fast_forward &= (not scale .requires_grad and not offset .requires_grad ) or (not torch .is_grad_enabled ())
189+
+    # if the user explicitly designated a specific rounding function, honor it strictly
+    _fast_forward &= (_round_fn == torch.round and _round_fn_inplace == torch.round_)
+
+    if _fast_forward:
+        ret = _torch_fake_quantize(tensor, scale, offset, qmin, qmax)
+
+        if ret is not None:
+            return ret
+
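+    # Fall through to the generic implementation below when the
+    # torch.fake_quantize fast path is not applicable or declined the input.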
    output_dtype = internal_dtype = tensor.dtype

    if not _is_numerically_stable(internal_dtype, qmin, qmax):
@@ -190,6 +216,60 @@ def quantize_dequantize(tensor: torch.Tensor, scale: torch.Tensor, offset: torch
                             offset.to(internal_dtype),
                             qmin, qmax).to(output_dtype).view(orig_tensor_shape)

+
+def _torch_fake_quantize(tensor: torch.Tensor,
+                         scale: torch.Tensor,
+                         offset: torch.Tensor,
+                         qmin: int,
+                         qmax: int) -> Optional[torch.Tensor]:
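+    # Dispatch to torch.fake_quantize_per_tensor_affine or
+    # torch.fake_quantize_per_channel_affine when the scale/offset layout allows
+    # it; return None to signal that the caller should take the generic path.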
+    scale_internal_dtype = torch.float32
+    tensor_internal_dtype = tensor.dtype
+
+    if _torch_version < (2, 6, 0) and tensor_internal_dtype == torch.bfloat16:
+        # torch.fake_quantize only supports bfloat16 in >=2.6.0
+        tensor_internal_dtype = torch.float32
+
+    is_per_tensor = scale.numel() == offset.numel() == 1
+
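+    # torch's fake-quantize ops take a zero_point argument; this backend's
+    # offset carries the opposite sign, so `-offset` is passed below.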
+    if is_per_tensor:
+        return torch.fake_quantize_per_tensor_affine(tensor.to(tensor_internal_dtype),
+                                                     scale.view(()).to(scale_internal_dtype),
+                                                     -offset.to(torch.int32).view(()),
+                                                     qmin, qmax).to(tensor.dtype)
+
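+    # Right-align scale/offset with the tensor by prepending singleton
+    # dimensions, so the channel axis can be found by comparing shapes.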
+    scale = scale.view(*(1 for _ in range(tensor.dim() - scale.dim())),
+                       *scale.shape)
+    offset = offset.view(*(1 for _ in range(tensor.dim() - offset.dim())),
+                         *offset.shape)
+
+    is_per_channel = scale.shape == offset.shape and all(
+        scale_dim in (1, tensor_dim)
+        for scale_dim, tensor_dim
+        in zip(scale.shape, tensor.shape)
+    )
+
+    if is_per_channel:
+        axes = [
+            axis for axis, scale_dim in enumerate(scale.shape) if scale_dim != 1
+        ]
+        assert axes
+
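+        # torch.fake_quantize_per_channel_affine only takes a single channel
+        # axis; if scale varies along more than one axis, give up and return None.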
+        if len(axes) == 1:
+            axis, = axes
+            try:
+                return torch.fake_quantize_per_channel_affine(tensor.to(tensor_internal_dtype),
+                                                              scale.flatten().to(scale_internal_dtype),
+                                                              -offset.to(torch.int32).flatten(),
+                                                              axis, qmin, qmax).to(tensor.dtype)
+            except RuntimeError:
+                # NOTE: torch.fake_quantize_per_channel_affine throws a RuntimeError
+                # if zero_point is not in [qmin, qmax]. In practice, this error will
+                # almost never occur because per-channel quantization always uses zero_point=0
+                return None
+
+    return None
+
+
@_onnx.register_symbolic(_onnx.dequantize_symbolic)
def dequantize(tensor: torch.Tensor, scale: torch.Tensor, offset: torch.Tensor, block_size: Optional[List] = None) \
        -> torch.Tensor: