Update README.md
README.md
```
pip install git+https://github.com/huggingface/transformers@main
pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
```
## Untie Embedding Weights

Before quantization, we first need to untie the model: the input embedding and unembedding (lm_head) layers share their weights, and we need to quantize them separately:
```
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    TorchAoConfig,
)
from torchao.quantization.quant_api import (
    IntxWeightOnlyConfig,
    Int8DynamicActivationIntxWeightConfig,
    AOPerModuleConfig
)
from torchao.quantization.granularity import PerGroup, PerAxis
import torch

model_id = "microsoft/Phi-4-mini-instruct"
untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(untied_model)
from transformers.modeling_utils import find_tied_parameters
print("tied weights:", find_tied_parameters(untied_model))
if getattr(untied_model.config.get_text_config(decoder=True), "tie_word_embeddings"):
    setattr(untied_model.config.get_text_config(decoder=True), "tie_word_embeddings", False)

untied_model._tied_weights_keys = []
# clone lm_head so it no longer shares storage with the input embedding
untied_model.lm_head.weight = torch.nn.Parameter(untied_model.lm_head.weight.clone())

print("tied weights:", find_tied_parameters(untied_model))

USER_ID = "YOUR_USER_ID"
MODEL_NAME = model_id.split("/")[-1]
save_to = f"{USER_ID}/{MODEL_NAME}-untied-weights"
untied_model.push_to_hub(save_to)
tokenizer.push_to_hub(save_to)
```
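
To sanity-check the untie before quantizing, the pushed checkpoint can be reloaded and inspected. A minimal sketch (reusing the `save_to` repo id from above):

```
from transformers import AutoModelForCausalLM
from transformers.modeling_utils import find_tied_parameters

# reload the untied checkpoint and confirm nothing is tied anymore
reloaded = AutoModelForCausalLM.from_pretrained(save_to)
print("tied weights after reload:", find_tied_parameters(reloaded))  # expect: []
```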

## Quantization

We used the following code to get the quantized model:

```
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    TorchAoConfig,
)
from torchao.quantization.quant_api import (
    IntxWeightOnlyConfig,
    Int8DynamicActivationIntxWeightConfig,
    AOPerModuleConfig
)
from torchao.quantization.granularity import PerGroup, PerAxis
import torch

# we start from the model with untied weights
model_id = "microsoft/Phi-4-mini-instruct"
USER_ID = "YOUR_USER_ID"
MODEL_NAME = model_id.split("/")[-1]
untied_model_id = f"{USER_ID}/{MODEL_NAME}-untied-weights"

# int8 per-axis weight-only quantization for the embedding
embedding_config = IntxWeightOnlyConfig(
    weight_dtype=torch.int8,
    granularity=PerAxis(0),
)
# int8 dynamic activation + int4 grouped weight quantization for linear layers
linear_config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
    weight_scale_dtype=torch.bfloat16,
)
quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# TODO: use AOPerModuleConfig once fix for tied weights is landed
```
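
The snippet above ends before the configs are applied. A minimal sketch of one way to apply them with torchao's `quantize_`, assuming the manual per-module route the TODO comment refers to (matching the embedding by module type is our assumption, not the author's code):

```
from torchao.quantization.quant_api import quantize_

# assumption: quantize embeddings first (matched by module type),
# then apply the linear config to the remaining linear layers
quantize_(
    quantized_model,
    embedding_config,
    filter_fn=lambda m, fqn: isinstance(m, torch.nn.Embedding),
)
quantize_(quantized_model, linear_config)
```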