Upload scale.py
scale.py
ADDED
@@ -0,0 +1,273 @@
# -*- coding: utf-8 -*-
"""Quantization scale module."""

import math
import typing as tp
from dataclasses import dataclass, field

import torch

from ...data.dtype import QuantDataType
from ...data.range import DynamicRange, QuantRange, RangeBound
from ...data.scale import QuantScale
from ...data.utils import ScaleUtils
from ...data.zero import ZeroPointDomain
from .simple import simple_quantize

from deepcompressor.utils import tools
logger = tools.logging.getLogger(__name__)

__all__ = ["quantize_scale", "QuantScaleInfo"]

def quantize_scale(
    s: torch.Tensor,
    /,
    *,
    quant_dtypes: tp.Sequence[QuantDataType],
    quant_spans: tp.Sequence[float],
    view_shapes: tp.Sequence[torch.Size],
) -> QuantScale:
    """Quantize the scale tensor.

    Args:
        s (`torch.Tensor`):
            The scale tensor.
        quant_dtypes (`Sequence[QuantDataType]`):
            The quantization dtypes of the scale tensor.
        quant_spans (`Sequence[float]`):
            The quantization spans of the scale tensor.
        view_shapes (`Sequence[torch.Size]`):
            The view shapes of the scale tensor.

    Returns:
        `QuantScale`:
            The quantized scale tensor.
    """
    # Validate the input before any tensor math. The meta-tensor check must come
    # first: boolean reductions such as `isnan().any()` cannot be evaluated on a
    # meta tensor.
    if s.is_meta:
        raise RuntimeError("Cannot quantize scale with meta tensor. Ensure model is loaded on actual device.")
    if s.numel() == 0:
        raise ValueError("Input tensor is empty")
    if s.isnan().any() or s.isinf().any():
        raise ValueError("Input tensor contains NaN or Inf values")
    if (s == 0).all():
        logger.warning("Input tensor contains all zeros - this may indicate meta tensor materialization issues")
        # Replace with a minimal non-zero tensor so quantization can proceed.
        s = torch.ones_like(s) * 1e-6

    scale = QuantScale()
    s = s.abs()
    for view_shape, quant_dtype, quant_span in zip(view_shapes[:-1], quant_dtypes[:-1], quant_spans[:-1], strict=True):
        s = s.view(view_shape)  # (#g0, rs0, #g1, rs1, #g2, rs2, ...)
        ss = s.amax(dim=list(range(1, len(view_shape), 2)), keepdim=True)  # i.e., s_dynamic_span
        ss = simple_quantize(
            ss / quant_span, has_zero_point=False, quant_dtype=quant_dtype
        )  # i.e., s_scale = s_dynamic_span / s_quant_span
        s = s / ss
        scale.append(ss)
    view_shape = view_shapes[-1]
    s = s.view(view_shape)
    if any(v != 1 for v in view_shape[1::2]):
        ss = s.amax(dim=list(range(1, len(view_shape), 2)), keepdim=True)
        ss = simple_quantize(ss / quant_spans[-1], has_zero_point=False, quant_dtype=quant_dtypes[-1])
    else:
        assert quant_spans[-1] == 1, "The last quant span must be 1."
        ss = simple_quantize(s, has_zero_point=False, quant_dtype=quant_dtypes[-1])
    scale.append(ss)
    scale.remove_zero()
    return scale
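
# A minimal usage sketch of the progressive scheme above. The dtype names and
# shapes are hypothetical stand-ins for this repo's `QuantDataType` instances;
# each level quantizes the running scale's per-group span and divides it out,
# so the effective scale is the product of the per-level scales:
#
#     scale = quantize_scale(
#         s,                                        # e.g. per-group max-abs values
#         quant_dtypes=[coarse_dtype, fine_dtype],  # hypothetical two-level setup
#         quant_spans=[15.0, 1.0],                  # the last span must be 1 here
#         view_shapes=[torch.Size((C, 1, G, 1))] * 2,
#     )
#     # scale.data holds the composed, already-quantized scale tensor.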


@dataclass
class QuantScaleInfo:
    # region tensor information
    tensor_view_shape: torch.Size
    tensor_quant_dtype: torch.dtype | QuantDataType
    tensor_zero_domain: ZeroPointDomain | None
    tensor_quant_range: QuantRange
    tensor_range_bound: RangeBound | None
    # endregion
    default_quant_dtype: torch.dtype | QuantDataType
    scale_view_shapes: list[torch.Size]
    scale_quant_dtypes: list[torch.dtype | QuantDataType]
    exponent_scale_level: int = field(init=False)
    zero_quant_dtype: torch.dtype | QuantDataType | None = field(init=False)
    # region linear scale information
    linear_tensor_quant_span: float = field(init=False)
    linear_scale_quant_dtypes: list[torch.dtype | QuantDataType] = field(init=False)
    linear_scale_view_shapes: list[torch.Size] = field(init=False)
    linear_scale_quant_spans: list[float] = field(init=False)
    # endregion
    # region exponent scale information
    exponent_tensor_quant_span: float = field(init=False)
    exponent_scale_quant_dtypes: list[torch.dtype | QuantDataType] = field(init=False)
    exponent_scale_view_shapes: list[torch.Size] = field(init=False)
    exponent_scale_quant_spans: list[float] = field(init=False)
    # endregion

    @property
    def has_zero_point(self) -> bool:
        return self.tensor_zero_domain is not None

    def __post_init__(self):
        if isinstance(self.tensor_quant_dtype, torch.dtype):
            raise NotImplementedError("torch.dtype is not supported yet.")
        self.tensor_quant_range = QuantRange.construct(
            self.tensor_quant_dtype, has_zero_point=self.has_zero_point, quant_range=self.tensor_quant_range
        )
        self.scale_quant_dtypes = ScaleUtils.infer_scale_dtypes(self.scale_quant_dtypes, self.default_quant_dtype)
        self.exponent_scale_level = ScaleUtils.infer_exponent_scale_level(self.scale_quant_dtypes)
        if self.has_zero_point:
            if self.tensor_zero_domain == ZeroPointDomain.PreScale:
                self.zero_quant_dtype = self.tensor_quant_dtype
            elif self.tensor_zero_domain == ZeroPointDomain.PostScale:
                # TODO: fix zero quant dtype (signed or unsigned)
                self.zero_quant_dtype = self.scale_quant_dtypes[-1]
                if isinstance(self.zero_quant_dtype, QuantDataType) and self.zero_quant_dtype.is_exponent:
                    self.zero_quant_dtype = self.default_quant_dtype
            else:
                raise ValueError(f"Unsupported zero point domain: {self.tensor_zero_domain}")
            self.linear_tensor_quant_span = self.tensor_quant_range.max - self.tensor_quant_range.min
            self.exponent_tensor_quant_span = 2 ** int(
                math.log2(self.tensor_quant_range.max) + int(self.tensor_quant_dtype.signed)
            )
        else:
            self.zero_quant_dtype = None
            self.linear_tensor_quant_span = self.tensor_quant_range.max
            self.exponent_tensor_quant_span = 2 ** int(math.log2(self.tensor_quant_range.max))
        if 0 <= self.exponent_scale_level < len(self.scale_quant_dtypes):
            lin_s_dtypes = self.scale_quant_dtypes[: self.exponent_scale_level]
            exp_s_dtypes = self.scale_quant_dtypes[self.exponent_scale_level :]
            lin_s_view_shapes = self.scale_view_shapes[: self.exponent_scale_level]
            exp_s_view_shapes = self.scale_view_shapes[self.exponent_scale_level :]
            exp_s_spans = ScaleUtils.infer_scale_quant_spans(exp_s_dtypes)
            lin_s_spans = ScaleUtils.infer_scale_quant_spans(lin_s_dtypes, base=exp_s_spans[-1]) if lin_s_dtypes else []
        else:
            lin_s_dtypes, exp_s_dtypes = self.scale_quant_dtypes, []
            lin_s_view_shapes, exp_s_view_shapes = self.scale_view_shapes, []
            lin_s_spans, exp_s_spans = ScaleUtils.infer_scale_quant_spans(lin_s_dtypes), []
        self.linear_scale_quant_dtypes = lin_s_dtypes
        self.linear_scale_view_shapes = lin_s_view_shapes
        self.linear_scale_quant_spans = lin_s_spans
        self.exponent_scale_quant_dtypes = exp_s_dtypes
        self.exponent_scale_view_shapes = exp_s_view_shapes
        self.exponent_scale_quant_spans = exp_s_spans

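    # Illustration of the split performed above (hypothetical dtype sequence in
    # which `ScaleUtils.infer_exponent_scale_level` returns 1):
    #
    #     scale_quant_dtypes          = [fp16_dtype, exp_dtype]
    #     linear_scale_quant_dtypes   = [fp16_dtype]   # levels before the split
    #     exponent_scale_quant_dtypes = [exp_dtype]    # levels from the split on
    #
    # Exponent spans are inferred first so that the last exponent span can serve
    # as the base for the linear spans.
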
    def quantize(
        self,
        *,
        # scale-based quantization related arguments
        scale: torch.Tensor | None = None,
        zero: torch.Tensor | None = None,
        # range-based quantization related arguments
        tensor: torch.Tensor | None = None,
        dynamic_range: DynamicRange | None = None,
    ) -> tuple[QuantScale, torch.Tensor]:
        """Get the quantization scale and zero point of the tensor to be quantized.

        Args:
            scale (`torch.Tensor` or `None`, *optional*, defaults to `None`):
                The scale tensor.
            zero (`torch.Tensor` or `None`, *optional*, defaults to `None`):
                The zero point tensor.
            tensor (`torch.Tensor` or `None`, *optional*, defaults to `None`):
                The tensor to be quantized. This is only used for range-based quantization.
            dynamic_range (`DynamicRange` or `None`, *optional*, defaults to `None`):
                The dynamic range of the tensor to be quantized.

        Returns:
            `tuple[QuantScale, torch.Tensor]`:
                The scale and the zero point.
        """
        # region step 1: get the dynamic span for range-based scale or the scale tensor
        if scale is None:
            range_based = True
            assert isinstance(tensor, torch.Tensor), "View tensor must be a tensor."
            dynamic_range = dynamic_range or DynamicRange()
            dynamic_range = dynamic_range.measure(
                tensor.view(self.tensor_view_shape),
                zero_domain=self.tensor_zero_domain,
                is_float_point=self.tensor_quant_dtype.is_float_point,
            )
            dynamic_range = dynamic_range.intersect(self.tensor_range_bound)
            dynamic_span = (dynamic_range.max - dynamic_range.min) if self.has_zero_point else dynamic_range.max
        else:
            range_based = False
            scale = scale.view(self.scale_view_shapes[-1])
            assert isinstance(scale, torch.Tensor), "Scale must be a tensor."
        # endregion
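        # Numeric illustration of the range-based path (hypothetical values): for
        # an unsigned 4-bit tensor dtype without a zero point, quant_range.max is
        # 15, so a measured dynamic_range.max of 3.0 gives dynamic_span = 3.0 and,
        # in step 2 below, linear_scale = 3.0 / 15 = 0.2.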
        # region step 2: get the scale
        if self.linear_scale_quant_dtypes:
            if range_based:
                linear_scale = dynamic_span / self.linear_tensor_quant_span
            elif self.exponent_scale_quant_dtypes:
                linear_scale = scale.mul(self.exponent_tensor_quant_span).div(self.linear_tensor_quant_span)
            else:
                linear_scale = scale
            lin_s = quantize_scale(
                linear_scale,
                quant_dtypes=self.linear_scale_quant_dtypes,
                quant_spans=self.linear_scale_quant_spans,
                view_shapes=self.linear_scale_view_shapes,
            )
            assert lin_s.data is not None, "Linear scale tensor is None."
            if not lin_s.data.is_meta:
                assert not lin_s.data.isnan().any(), "Linear scale tensor contains NaN."
                assert not lin_s.data.isinf().any(), "Linear scale tensor contains Inf."
        else:
            lin_s = QuantScale()
        if self.exponent_scale_quant_dtypes:
            if range_based:
                exp_scale = dynamic_span / self.exponent_tensor_quant_span
            else:
                exp_scale = scale
            if lin_s.data is not None:
                lin_s.data = lin_s.data.expand(self.linear_scale_view_shapes[-1]).reshape(self.scale_view_shapes[-1])
                exp_scale = exp_scale / lin_s.data
            exp_s = quantize_scale(
                exp_scale,
                quant_dtypes=self.exponent_scale_quant_dtypes,
                quant_spans=self.exponent_scale_quant_spans,
                view_shapes=self.exponent_scale_view_shapes,
            )
            assert exp_s.data is not None, "Exponential scale tensor is None."
            assert not exp_s.data.isnan().any(), "Exponential scale tensor contains NaN."
            assert not exp_s.data.isinf().any(), "Exponential scale tensor contains Inf."
            s = exp_s if lin_s.data is None else lin_s.extend(exp_s)
        else:
            s = lin_s
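        # Note: when both scale families are present, the linear component is
        # divided out of the exponent input above, so the exponent levels only
        # absorb the remaining (power-of-two) factor and the final QuantScale
        # composes as lin_s extended by exp_s.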

        # Log diagnostic context before failing so the caller can see which scale
        # family produced the empty result.
        if s.data is None:
            logger.error(f"Linear scale dtypes: {self.linear_scale_quant_dtypes}")
            logger.error(f"Exponent scale dtypes: {self.exponent_scale_quant_dtypes}")
            if lin_s.data is not None:
                logger.error(f"Linear scale data shape: {lin_s.data.shape}")
            raise RuntimeError("Scale computation failed - resulting scale is None")
        assert not s.data.isnan().any(), "Scale tensor contains NaN."
        assert not s.data.isinf().any(), "Scale tensor contains Inf."
        # endregion
        # region step 3: get the zero point
        if self.has_zero_point:
            if range_based:
                if self.tensor_zero_domain == ZeroPointDomain.PreScale:
                    zero = self.tensor_quant_range.min - dynamic_range.min / s.data
                else:
                    zero = self.tensor_quant_range.min * s.data - dynamic_range.min
            assert isinstance(zero, torch.Tensor), "Zero point must be a tensor."
            z = simple_quantize(zero, has_zero_point=True, quant_dtype=self.zero_quant_dtype)
        else:
            z = torch.tensor(0, dtype=s.data.dtype, device=s.data.device)
        assert not z.isnan().any(), "Zero point tensor contains NaN."
        assert not z.isinf().any(), "Zero point tensor contains Inf."
        # endregion
        return s, z
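

# End-to-end sketch of `QuantScaleInfo.quantize` (all values hypothetical; the
# dtype objects stand in for this repo's `QuantDataType` instances and the
# arguments only mirror the dataclass fields above):
#
#     info = QuantScaleInfo(
#         tensor_view_shape=torch.Size((C, 1, G, 1)),
#         tensor_quant_dtype=int4_dtype,
#         tensor_zero_domain=None,            # symmetric, i.e. no zero point
#         tensor_quant_range=None,            # rebuilt by QuantRange.construct
#         tensor_range_bound=None,
#         default_quant_dtype=fp16_dtype,
#         scale_view_shapes=[torch.Size((C, 1, G, 1))],
#         scale_quant_dtypes=[fp16_dtype],
#     )
#     s, z = info.quantize(tensor=weight)     # range-based path (scale is None)
#     # s.data is the composed scale; z is 0 since there is no zero point.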