Update README.md
README.md
```
pip install git+https://github.com/huggingface/transformers@main
pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
```
## Untie Embedding Weights

Before quantization, we first need to untie the model: the input embedding and unembedding (lm_head) layers share their weights, and we need to quantize them separately:
```
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    TorchAoConfig,
)
from torchao.quantization.quant_api import (
    IntxWeightOnlyConfig,
    Int8DynamicActivationIntxWeightConfig,
    AOPerModuleConfig
)
from torchao.quantization.granularity import PerGroup, PerAxis
import torch

model_id = "microsoft/Phi-4-mini-instruct"
untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(untied_model)
from transformers.modeling_utils import find_tied_parameters
print("tied weights:", find_tied_parameters(untied_model))
if getattr(untied_model.config.get_text_config(decoder=True), "tie_word_embeddings"):
    setattr(untied_model.config.get_text_config(decoder=True), "tie_word_embeddings", False)

untied_model._tied_weights_keys = []
# clone lm_head so it no longer shares storage with the input embedding
untied_model.lm_head.weight = torch.nn.Parameter(untied_model.lm_head.weight.clone())

print("tied weights:", find_tied_parameters(untied_model))

USER_ID = "YOUR_USER_ID"
MODEL_NAME = model_id.split("/")[-1]
save_to = f"{USER_ID}/{MODEL_NAME}-untied-weights"
untied_model.push_to_hub(save_to)
tokenizer.push_to_hub(save_to)
```
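
To sanity-check the untie before quantizing, the pushed checkpoint can be reloaded and inspected. A minimal sketch (reusing the `save_to` repo id from above):

```
from transformers import AutoModelForCausalLM
from transformers.modeling_utils import find_tied_parameters

# reload the untied checkpoint and confirm nothing is tied anymore
reloaded = AutoModelForCausalLM.from_pretrained(save_to)
print("tied weights after reload:", find_tied_parameters(reloaded))  # expect: []
```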

## Quantization

We used the following code to get the quantized model:

```
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    TorchAoConfig,
)
from torchao.quantization.quant_api import (
    IntxWeightOnlyConfig,
    Int8DynamicActivationIntxWeightConfig,
    AOPerModuleConfig
)
from torchao.quantization.granularity import PerGroup, PerAxis
import torch

# we start from the model with untied weights
model_id = "microsoft/Phi-4-mini-instruct"
USER_ID = "YOUR_USER_ID"
MODEL_NAME = model_id.split("/")[-1]
untied_model_id = f"{USER_ID}/{MODEL_NAME}-untied-weights"

# int8 per-axis weight-only quantization for the embedding
embedding_config = IntxWeightOnlyConfig(
    weight_dtype=torch.int8,
    granularity=PerAxis(0),
)
# int8 dynamic activation + int4 grouped weight quantization for linear layers
linear_config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
    weight_scale_dtype=torch.bfloat16,
)
quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# TODO: use AOPerModuleConfig once fix for tied weights is landed
```
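
The snippet above ends before the configs are applied. A minimal sketch of one way to apply them with torchao's `quantize_`, assuming the manual per-module route the TODO comment refers to (matching the embedding by module type is our assumption, not the author's code):

```
from torchao.quantization.quant_api import quantize_

# assumption: quantize embeddings first (matched by module type),
# then apply the linear config to the remaining linear layers
quantize_(
    quantized_model,
    embedding_config,
    filter_fn=lambda m, fqn: isinstance(m, torch.nn.Embedding),
)
quantize_(quantized_model, linear_config)
```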