Update README.md
Browse files
README.md
CHANGED
@@ -307,12 +307,14 @@ Run the benchmarks under `vllm` root folder:
|
|
307 |
|
308 |
### baseline
|
309 |
```Shell
|
310 |
-
|
|
|
311 |
```
|
312 |
|
313 |
### int4wo-hqq
|
314 |
```Shell
|
315 |
-
|
|
|
316 |
```
|
317 |
|
318 |
## benchmark_serving
|
@@ -334,23 +336,27 @@ Note: you can change the number of prompts to be benchmarked with `--num-prompts
|
|
334 |
### baseline
|
335 |
Server:
|
336 |
```Shell
|
337 |
-
|
|
|
338 |
```
|
339 |
|
340 |
Client:
|
341 |
```Shell
|
342 |
-
|
|
|
343 |
```
|
344 |
|
345 |
### int4wo-hqq
|
346 |
Server:
|
347 |
```Shell
|
348 |
-
|
|
|
349 |
```
|
350 |
|
351 |
Client:
|
352 |
```Shell
|
353 |
-
|
|
|
354 |
```
|
355 |
|
356 |
|
|
|
307 |
|
308 |
### baseline
|
309 |
```Shell
|
310 |
+
export MODEL=Qwen/Qwen3-8B
|
311 |
+
python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
|
312 |
```
|
313 |
|
314 |
### int4wo-hqq
|
315 |
```Shell
|
316 |
+
export MODEL=pytorch/Qwen3-8B-int4wo-hqq
|
317 |
+
VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
|
318 |
```
|
319 |
|
320 |
## benchmark_serving
|
|
|
336 |
### baseline
|
337 |
Server:
|
338 |
```Shell
|
339 |
+
export MODEL=Qwen/Qwen3-8B
|
340 |
+
vllm serve $MODEL --tokenizer $MODEL -O3
|
341 |
```
|
342 |
|
343 |
Client:
|
344 |
```Shell
|
345 |
+
export MODEL=Qwen/Qwen3-8B
|
346 |
+
python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
|
347 |
```
|
348 |
|
349 |
### int4wo-hqq
|
350 |
Server:
|
351 |
```Shell
|
352 |
+
export MODEL=pytorch/Qwen3-8B-int4wo-hqq
|
353 |
+
VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
|
354 |
```
|
355 |
|
356 |
Client:
|
357 |
```Shell
|
358 |
+
export MODEL=pytorch/Qwen3-8B-int4wo-hqq
|
359 |
+
python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
|
360 |
```
|
361 |
|
362 |
|