Update README.md
README.md
CHANGED

@@ -338,7 +338,7 @@ Note: you can change the number of prompts to be benchmarked with `--num-prompts`
 Server:
 ```Shell
 export MODEL=Qwen/Qwen3-8B
-vllm serve $MODEL --tokenizer
+vllm serve $MODEL --tokenizer $MODEL -O3
 ```
 
 Client:

@@ -351,7 +351,7 @@ python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --
 Server:
 ```Shell
 export MODEL=pytorch/Qwen3-8B-int4wo-hqq
-VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer
+VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
 ```
 
 Client: