Upload scale.py
scale.py
ADDED
@@ -0,0 +1,273 @@
# -*- coding: utf-8 -*-
"""Quantization scale module."""

import math
import typing as tp
from dataclasses import dataclass, field

import torch

from ...data.dtype import QuantDataType
from ...data.range import DynamicRange, QuantRange, RangeBound
from ...data.scale import QuantScale
from ...data.utils import ScaleUtils
from ...data.zero import ZeroPointDomain
from .simple import simple_quantize

from deepcompressor.utils import tools
logger = tools.logging.getLogger(__name__)

__all__ = ["quantize_scale", "QuantScaleInfo"]

def quantize_scale(
    s: torch.Tensor,
    /,
    *,
    quant_dtypes: tp.Sequence[QuantDataType],
    quant_spans: tp.Sequence[float],
    view_shapes: tp.Sequence[torch.Size],
) -> QuantScale:
    """Quantize the scale tensor.

    Args:
        s (`torch.Tensor`):
            The scale tensor.
        quant_dtypes (`Sequence[QuantDataType]`):
            The quantization dtypes of the scale tensor.
        quant_spans (`Sequence[float]`):
            The quantization spans of the scale tensor.
        view_shapes (`Sequence[torch.Size]`):
            The view shapes of the scale tensor.

    Returns:
        `QuantScale`:
            The quantized scale tensor.
    """
    # Validate the input before any tensor math. The meta-tensor check must come
    # first: boolean reductions such as `isnan().any()` cannot be evaluated on a
    # meta tensor.
    if s.is_meta:
        raise RuntimeError("Cannot quantize scale with meta tensor. Ensure model is loaded on actual device.")
    if s.numel() == 0:
        raise ValueError("Input tensor is empty")
    if s.isnan().any() or s.isinf().any():
        raise ValueError("Input tensor contains NaN or Inf values")
    if (s == 0).all():
        logger.warning("Input tensor contains all zeros - this may indicate meta tensor materialization issues")
        # Replace with a minimal non-zero tensor so quantization can proceed.
        s = torch.ones_like(s) * 1e-6

    scale = QuantScale()
    s = s.abs()
    for view_shape, quant_dtype, quant_span in zip(view_shapes[:-1], quant_dtypes[:-1], quant_spans[:-1], strict=True):
        s = s.view(view_shape)  # (#g0, rs0, #g1, rs1, #g2, rs2, ...)
        ss = s.amax(dim=list(range(1, len(view_shape), 2)), keepdim=True)  # i.e., s_dynamic_span
        ss = simple_quantize(
            ss / quant_span, has_zero_point=False, quant_dtype=quant_dtype
        )  # i.e., s_scale = s_dynamic_span / s_quant_span
        s = s / ss
        scale.append(ss)
    view_shape = view_shapes[-1]
    s = s.view(view_shape)
    if any(v != 1 for v in view_shape[1::2]):
        ss = s.amax(dim=list(range(1, len(view_shape), 2)), keepdim=True)
        ss = simple_quantize(ss / quant_spans[-1], has_zero_point=False, quant_dtype=quant_dtypes[-1])
    else:
        assert quant_spans[-1] == 1, "The last quant span must be 1."
        ss = simple_quantize(s, has_zero_point=False, quant_dtype=quant_dtypes[-1])
    scale.append(ss)
    scale.remove_zero()
    return scale
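
# A minimal usage sketch of the progressive scheme above. The dtype names and
# shapes are hypothetical stand-ins for this repo's `QuantDataType` instances;
# each level quantizes the running scale's per-group span and divides it out,
# so the effective scale is the product of the per-level scales:
#
#     scale = quantize_scale(
#         s,                                        # e.g. per-group max-abs values
#         quant_dtypes=[coarse_dtype, fine_dtype],  # hypothetical two-level setup
#         quant_spans=[15.0, 1.0],                  # the last span must be 1 here
#         view_shapes=[torch.Size((C, 1, G, 1))] * 2,
#     )
#     # scale.data holds the composed, already-quantized scale tensor.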


@dataclass
class QuantScaleInfo:
    # region tensor information
    tensor_view_shape: torch.Size
    tensor_quant_dtype: torch.dtype | QuantDataType
    tensor_zero_domain: ZeroPointDomain | None
    tensor_quant_range: QuantRange
    tensor_range_bound: RangeBound | None
    # endregion
    default_quant_dtype: torch.dtype | QuantDataType
    scale_view_shapes: list[torch.Size]
    scale_quant_dtypes: list[torch.dtype | QuantDataType]
    exponent_scale_level: int = field(init=False)
    zero_quant_dtype: torch.dtype | QuantDataType | None = field(init=False)
    # region linear scale information
    linear_tensor_quant_span: float = field(init=False)
    linear_scale_quant_dtypes: list[torch.dtype | QuantDataType] = field(init=False)
    linear_scale_view_shapes: list[torch.Size] = field(init=False)
    linear_scale_quant_spans: list[float] = field(init=False)
    # endregion
    # region exponent scale information
    exponent_tensor_quant_span: float = field(init=False)
    exponent_scale_quant_dtypes: list[torch.dtype | QuantDataType] = field(init=False)
    exponent_scale_view_shapes: list[torch.Size] = field(init=False)
    exponent_scale_quant_spans: list[float] = field(init=False)
    # endregion

    @property
    def has_zero_point(self) -> bool:
        return self.tensor_zero_domain is not None

    def __post_init__(self):
        if isinstance(self.tensor_quant_dtype, torch.dtype):
            raise NotImplementedError("torch.dtype is not supported yet.")
        self.tensor_quant_range = QuantRange.construct(
            self.tensor_quant_dtype, has_zero_point=self.has_zero_point, quant_range=self.tensor_quant_range
        )
        self.scale_quant_dtypes = ScaleUtils.infer_scale_dtypes(self.scale_quant_dtypes, self.default_quant_dtype)
        self.exponent_scale_level = ScaleUtils.infer_exponent_scale_level(self.scale_quant_dtypes)
        if self.has_zero_point:
            if self.tensor_zero_domain == ZeroPointDomain.PreScale:
                self.zero_quant_dtype = self.tensor_quant_dtype
            elif self.tensor_zero_domain == ZeroPointDomain.PostScale:
                # TODO: fix zero quant dtype (signed or unsigned)
                self.zero_quant_dtype = self.scale_quant_dtypes[-1]
                if isinstance(self.zero_quant_dtype, QuantDataType) and self.zero_quant_dtype.is_exponent:
                    self.zero_quant_dtype = self.default_quant_dtype
            else:
                raise ValueError(f"Unsupported zero point domain: {self.tensor_zero_domain}")
            self.linear_tensor_quant_span = self.tensor_quant_range.max - self.tensor_quant_range.min
            self.exponent_tensor_quant_span = 2 ** int(
                math.log2(self.tensor_quant_range.max) + int(self.tensor_quant_dtype.signed)
            )
        else:
            self.zero_quant_dtype = None
            self.linear_tensor_quant_span = self.tensor_quant_range.max
            self.exponent_tensor_quant_span = 2 ** int(math.log2(self.tensor_quant_range.max))
        if 0 <= self.exponent_scale_level < len(self.scale_quant_dtypes):
            lin_s_dtypes = self.scale_quant_dtypes[: self.exponent_scale_level]
            exp_s_dtypes = self.scale_quant_dtypes[self.exponent_scale_level :]
            lin_s_view_shapes = self.scale_view_shapes[: self.exponent_scale_level]
            exp_s_view_shapes = self.scale_view_shapes[self.exponent_scale_level :]
            exp_s_spans = ScaleUtils.infer_scale_quant_spans(exp_s_dtypes)
            lin_s_spans = ScaleUtils.infer_scale_quant_spans(lin_s_dtypes, base=exp_s_spans[-1]) if lin_s_dtypes else []
        else:
            lin_s_dtypes, exp_s_dtypes = self.scale_quant_dtypes, []
            lin_s_view_shapes, exp_s_view_shapes = self.scale_view_shapes, []
            lin_s_spans, exp_s_spans = ScaleUtils.infer_scale_quant_spans(lin_s_dtypes), []
        self.linear_scale_quant_dtypes = lin_s_dtypes
        self.linear_scale_view_shapes = lin_s_view_shapes
        self.linear_scale_quant_spans = lin_s_spans
        self.exponent_scale_quant_dtypes = exp_s_dtypes
        self.exponent_scale_view_shapes = exp_s_view_shapes
        self.exponent_scale_quant_spans = exp_s_spans

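    # Illustration of the split performed above (hypothetical dtype sequence in
    # which `ScaleUtils.infer_exponent_scale_level` returns 1):
    #
    #     scale_quant_dtypes          = [fp16_dtype, exp_dtype]
    #     linear_scale_quant_dtypes   = [fp16_dtype]   # levels before the split
    #     exponent_scale_quant_dtypes = [exp_dtype]    # levels from the split on
    #
    # Exponent spans are inferred first so that the last exponent span can serve
    # as the base for the linear spans.
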
    def quantize(
        self,
        *,
        # scale-based quantization related arguments
        scale: torch.Tensor | None = None,
        zero: torch.Tensor | None = None,
        # range-based quantization related arguments
        tensor: torch.Tensor | None = None,
        dynamic_range: DynamicRange | None = None,
    ) -> tuple[QuantScale, torch.Tensor]:
        """Get the quantization scale and zero point of the tensor to be quantized.

        Args:
            scale (`torch.Tensor` or `None`, *optional*, defaults to `None`):
                The scale tensor.
            zero (`torch.Tensor` or `None`, *optional*, defaults to `None`):
                The zero point tensor.
            tensor (`torch.Tensor` or `None`, *optional*, defaults to `None`):
                The tensor to be quantized. This is only used for range-based quantization.
            dynamic_range (`DynamicRange` or `None`, *optional*, defaults to `None`):
                The dynamic range of the tensor to be quantized.

        Returns:
            `tuple[QuantScale, torch.Tensor]`:
                The scale and the zero point.
        """
        # region step 1: get the dynamic span for range-based scale or the scale tensor
        if scale is None:
            range_based = True
            assert isinstance(tensor, torch.Tensor), "View tensor must be a tensor."
            dynamic_range = dynamic_range or DynamicRange()
            dynamic_range = dynamic_range.measure(
                tensor.view(self.tensor_view_shape),
                zero_domain=self.tensor_zero_domain,
                is_float_point=self.tensor_quant_dtype.is_float_point,
            )
            dynamic_range = dynamic_range.intersect(self.tensor_range_bound)
            dynamic_span = (dynamic_range.max - dynamic_range.min) if self.has_zero_point else dynamic_range.max
        else:
            range_based = False
            scale = scale.view(self.scale_view_shapes[-1])
            assert isinstance(scale, torch.Tensor), "Scale must be a tensor."
        # endregion
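        # Numeric illustration of the range-based path (hypothetical values): for
        # an unsigned 4-bit tensor dtype without a zero point, quant_range.max is
        # 15, so a measured dynamic_range.max of 3.0 gives dynamic_span = 3.0 and,
        # in step 2 below, linear_scale = 3.0 / 15 = 0.2.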
        # region step 2: get the scale
        if self.linear_scale_quant_dtypes:
            if range_based:
                linear_scale = dynamic_span / self.linear_tensor_quant_span
            elif self.exponent_scale_quant_dtypes:
                linear_scale = scale.mul(self.exponent_tensor_quant_span).div(self.linear_tensor_quant_span)
            else:
                linear_scale = scale
            lin_s = quantize_scale(
                linear_scale,
                quant_dtypes=self.linear_scale_quant_dtypes,
                quant_spans=self.linear_scale_quant_spans,
                view_shapes=self.linear_scale_view_shapes,
            )
            assert lin_s.data is not None, "Linear scale tensor is None."
            if not lin_s.data.is_meta:
                assert not lin_s.data.isnan().any(), "Linear scale tensor contains NaN."
                assert not lin_s.data.isinf().any(), "Linear scale tensor contains Inf."
        else:
            lin_s = QuantScale()
        if self.exponent_scale_quant_dtypes:
            if range_based:
                exp_scale = dynamic_span / self.exponent_tensor_quant_span
            else:
                exp_scale = scale
            if lin_s.data is not None:
                lin_s.data = lin_s.data.expand(self.linear_scale_view_shapes[-1]).reshape(self.scale_view_shapes[-1])
                exp_scale = exp_scale / lin_s.data
            exp_s = quantize_scale(
                exp_scale,
                quant_dtypes=self.exponent_scale_quant_dtypes,
                quant_spans=self.exponent_scale_quant_spans,
                view_shapes=self.exponent_scale_view_shapes,
            )
            assert exp_s.data is not None, "Exponential scale tensor is None."
            assert not exp_s.data.isnan().any(), "Exponential scale tensor contains NaN."
            assert not exp_s.data.isinf().any(), "Exponential scale tensor contains Inf."
            s = exp_s if lin_s.data is None else lin_s.extend(exp_s)
        else:
            s = lin_s
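        # Note: when both scale families are present, the linear component is
        # divided out of the exponent input above, so the exponent levels only
        # absorb the remaining (power-of-two) factor and the final QuantScale
        # composes as lin_s extended by exp_s.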

        # Log diagnostic context before failing so the caller can see which scale
        # family produced the empty result.
        if s.data is None:
            logger.error(f"Linear scale dtypes: {self.linear_scale_quant_dtypes}")
            logger.error(f"Exponent scale dtypes: {self.exponent_scale_quant_dtypes}")
            if lin_s.data is not None:
                logger.error(f"Linear scale data shape: {lin_s.data.shape}")
            raise RuntimeError("Scale computation failed - resulting scale is None")
        assert not s.data.isnan().any(), "Scale tensor contains NaN."
        assert not s.data.isinf().any(), "Scale tensor contains Inf."
        # endregion
        # region step 3: get the zero point
        if self.has_zero_point:
            if range_based:
                if self.tensor_zero_domain == ZeroPointDomain.PreScale:
                    zero = self.tensor_quant_range.min - dynamic_range.min / s.data
                else:
                    zero = self.tensor_quant_range.min * s.data - dynamic_range.min
            assert isinstance(zero, torch.Tensor), "Zero point must be a tensor."
            z = simple_quantize(zero, has_zero_point=True, quant_dtype=self.zero_quant_dtype)
        else:
            z = torch.tensor(0, dtype=s.data.dtype, device=s.data.device)
        assert not z.isnan().any(), "Zero point tensor contains NaN."
        assert not z.isinf().any(), "Zero point tensor contains Inf."
        # endregion
        return s, z
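

# End-to-end sketch of `QuantScaleInfo.quantize` (all values hypothetical; the
# dtype objects stand in for this repo's `QuantDataType` instances and the
# arguments only mirror the dataclass fields above):
#
#     info = QuantScaleInfo(
#         tensor_view_shape=torch.Size((C, 1, G, 1)),
#         tensor_quant_dtype=int4_dtype,
#         tensor_zero_domain=None,            # symmetric, i.e. no zero point
#         tensor_quant_range=None,            # rebuilt by QuantRange.construct
#         tensor_range_bound=None,
#         default_quant_dtype=fp16_dtype,
#         scale_view_shapes=[torch.Size((C, 1, G, 1))],
#         scale_quant_dtypes=[fp16_dtype],
#     )
#     s, z = info.quantize(tensor=weight)     # range-based path (scale is None)
#     # s.data is the composed scale; z is 0 since there is no zero point.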