jerryzh168 committed
Commit e3c16b0 · verified · 1 Parent(s): 8004dfc

Update README.md

Files changed (1): README.md (+12 -6)
README.md CHANGED
@@ -307,12 +307,14 @@ Run the benchmarks under `vllm` root folder:
  
  ### baseline
  ```Shell
- python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model microsoft/Phi-4-mini-instruct --batch-size 1
+ export MODEL=Qwen/Qwen3-8B
+ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
  ```
  
  ### int4wo-hqq
  ```Shell
- VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model pytorch/Phi-4-mini-instruct-int4wo-hqq --batch-size 1
+ export MODEL=pytorch/Qwen3-8B-int4wo-hqq
+ VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
  ```
  
  ## benchmark_serving
@@ -334,23 +336,27 @@ Note: you can change the number of prompts to be benchmarked with `--num-prompts`
  ### baseline
  Server:
  ```Shell
- vllm serve microsoft/Phi-4-mini-instruct --tokenizer microsoft/Phi-4-mini-instruct -O3
+ export MODEL=Qwen/Qwen3-8B
+ vllm serve $MODEL --tokenizer microsoft/Phi-4-mini-instruct -O3
  ```
  
  Client:
  ```Shell
- python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer microsoft/Phi-4-mini-instruct --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model microsoft/Phi-4-mini-instruct --num-prompts 1
+ export MODEL=Qwen/Qwen3-8B
+ python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
  ```
  
  ### int4wo-hqq
  Server:
  ```Shell
- VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Phi-4-mini-instruct-int4wo-hqq --tokenizer microsoft/Phi-4-mini-instruct -O3 --pt-load-map-location cuda:0
+ export MODEL=pytorch/Qwen3-8B-int4wo-hqq
+ VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer microsoft/Phi-4-mini-instruct -O3 --pt-load-map-location cuda:0
  ```
  
  Client:
  ```Shell
- python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer microsoft/Phi-4-mini-instruct --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model pytorch/Phi-4-mini-instruct-int4wo-hqq --num-prompts 1
+ export MODEL=pytorch/Qwen3-8B-int4wo-hqq
+ python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
  ```
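
One thing to watch in the updated `vllm serve` lines: they still pass `--tokenizer microsoft/Phi-4-mini-instruct`, which looks like a leftover from the earlier Phi-4 instructions. If the intent is to benchmark the Qwen3-8B checkpoints, the tokenizer presumably needs to track the model, following the same pattern the old commands used (quantized checkpoint plus the base model's tokenizer). A hedged sketch of how the server command would then look:

```Shell
# Assumption: the tokenizer should point at the base Qwen3-8B repo rather than Phi-4;
# adjust if the quantized repo intentionally reuses a different tokenizer.
export MODEL=pytorch/Qwen3-8B-int4wo-hqq
VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer Qwen/Qwen3-8B -O3 --pt-load-map-location cuda:0
```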
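
The `benchmark_serving.py` client commands expect `ShareGPT_V3_unfiltered_cleaned_split.json` in the working directory. A minimal sketch for fetching it, assuming the commonly used mirror on the Hugging Face Hub:

```Shell
# Download the ShareGPT dataset that --dataset-path points at (run from the vllm root folder).
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```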
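
Before launching the client, it can help to confirm the server is actually up. A quick check against vLLM's OpenAI-compatible completions endpoint, assuming the default port 8000 and the int4wo-hqq checkpoint from above:

```Shell
# Send one short completion request to the running server as a sanity check.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "pytorch/Qwen3-8B-int4wo-hqq", "prompt": "Hello, my name is", "max_tokens": 16}'
```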