jerryzh168 committed
Commit 68c3564 · verified · 1 Parent(s): d21ea4d

Update README.md

Files changed (1)
  1. README.md +4 -4
README.md CHANGED
@@ -228,8 +228,8 @@ lm_eval --model hf --model_args pretrained=pytorch/Phi-4-mini-instruct-int4wo-hq
 
 | Benchmark | | |
 |------------------|----------------|--------------------------------|
-| | Phi-4 mini-Ins | Phi-4-mini-instruct-int4wo-hqq |
-| Peak Memory (GB) | 8.91 | 2.98 (67% reduction) |
+| | Qwen3-8B | Qwen3-8B-int4wo-hqq |
+| Peak Memory (GB) | 6.41 | 6.27 (TODO% reduction) |
 
 
 ## Code Example
@@ -240,8 +240,8 @@ We can use the following code to get a sense of peak memory usage during inferen
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
-# use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-int4wo-hqq"
-model_id = "pytorch/Phi-4-mini-instruct-int4wo-hqq"
+# use "Qwen/Qwen3-8B" or "pytorch/Qwen3-8B-int4wo-hqq"
+model_id = "pytorch/Qwen3-8B-int4wo-hqq"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
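The diff ends at line 247 of the README, so the remainder of the card's peak-memory snippet is not visible here. Purely as an illustrative sketch (not the card's actual code), the "Peak Memory (GB)" row could be reproduced along these lines; it assumes a CUDA device with torchao installed for the quantized checkpoint, and the prompt and generation length are placeholders:

```python
# Illustrative sketch only -- the model card's full snippet is not shown in this diff.
# Assumes a CUDA device; prompt and max_new_tokens are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# use "Qwen/Qwen3-8B" for the baseline or "pytorch/Qwen3-8B-int4wo-hqq" for the quantized model
model_id = "pytorch/Qwen3-8B-int4wo-hqq"
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompt = "Give me a short introduction to large language models."
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device)

with torch.no_grad():
    output = quantized_model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Peak memory reserved by the CUDA caching allocator since program start, in GB.
peak_gb = torch.cuda.max_memory_reserved() / 1e9
print(f"Peak memory: {peak_gb:.2f} GB")
```

`torch.cuda.max_memory_reserved()` is one way to get a number comparable to the table above; the card may use a different measurement, so treat the snippet as a rough guide rather than the reference procedure.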