Update README.md
README.md
@@ -307,12 +307,14 @@ Run the benchmarks under `vllm` root folder:

### baseline
```Shell
-
+export MODEL=Qwen/Qwen3-8B
+python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
```

### int4wo-hqq
```Shell
-
+export MODEL=pytorch/Qwen3-8B-int4wo-hqq
+VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
```

## benchmark_serving
@@ -334,23 +336,27 @@ Note: you can change the number of prompts to be benchmarked with `--num-prompts`
### baseline
Server:
```Shell
-
+export MODEL=Qwen/Qwen3-8B
+vllm serve $MODEL --tokenizer $MODEL -O3
```

Client:
```Shell
-
+export MODEL=Qwen/Qwen3-8B
+python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
```

### int4wo-hqq
Server:
```Shell
-
+export MODEL=pytorch/Qwen3-8B-int4wo-hqq
+VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
```

Client:
```Shell
-
+export MODEL=pytorch/Qwen3-8B-int4wo-hqq
+python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
```
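Note: the `benchmark_serving.py` client commands above assume `ShareGPT_V3_unfiltered_cleaned_split.json` is already present in the working directory. A minimal sketch for fetching it, assuming the commonly used Hugging Face mirror of the ShareGPT dataset (adjust the URL if you keep the file elsewhere):

```Shell
# Assumed mirror URL for the ShareGPT benchmark dataset; not part of the original README.
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```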