Update README.md
README.md
CHANGED
```diff
@@ -228,8 +228,8 @@ lm_eval --model hf --model_args pretrained=pytorch/Phi-4-mini-instruct-int4wo-hqq
 
 | Benchmark        |                |                                |
 |------------------|----------------|--------------------------------|
-| |
-| Peak Memory (GB) |
+|                  | Qwen3-8B       | Qwen3-8B-int4wo-hqq            |
+| Peak Memory (GB) | 6.41           | 6.27 (TODO% reduction)         |
 
 
 ## Code Example
@@ -240,8 +240,8 @@ We can use the following code to get a sense of peak memory usage during inference
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
-# use "
-model_id = "pytorch/
+# use "Qwen/Qwen3-8B" or "pytorch/Qwen3-8B-int4wo-hqq"
+model_id = "pytorch/Qwen3-8B-int4wo-hqq"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
```
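The hunk ends at the tokenizer line, so the peak-memory measurement that follows in the README is not visible here. Below is a minimal sketch of how the "Peak Memory (GB)" numbers in the table could be reproduced, assuming a single CUDA device and using the `torch.cuda` allocator statistics; the prompt and generation settings are illustrative assumptions, not taken from the README.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Either the baseline "Qwen/Qwen3-8B" or the quantized checkpoint, loaded as in the diff above.
model_id = "pytorch/Qwen3-8B-int4wo-hqq"
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reset allocator statistics so the reading reflects only this inference run.
torch.cuda.reset_peak_memory_stats()

# Hypothetical prompt and generation length; the README's actual measurement
# code is not shown in this hunk.
prompt = "Give a brief overview of int4 weight-only quantization."
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device)
with torch.no_grad():
    quantized_model.generate(**inputs, max_new_tokens=128)

# Peak memory reserved by the caching allocator during inference, in GB.
peak_gb = torch.cuda.max_memory_reserved() / 1e9
print(f"Peak Memory (GB): {peak_gb:.2f}")
```

`torch.cuda.max_memory_reserved()` reports what the caching allocator holds, which is typically a bit higher than `torch.cuda.max_memory_allocated()`; which one the table uses is not stated in this hunk.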