Update README.md
README.md CHANGED

@@ -236,6 +236,8 @@ print(f"Peak Memory Usage: {mem:.02f} GB")
 | latency (batch_size=1) | 8.93s | 5.16s (1.73x speedup) |
 | latency (batch_size=256) | 33.85s | 16.15s (2.10x speedup) |
 
+Note: tested with `fbgemm-gpu-genai` installed.
+
 <details>
 <summary> Reproduce latency benchmarks </summary>
 
@@ -245,8 +247,13 @@ git clone git@github.com:vllm-project/vllm.git
 cd vllm
 VLLM_USE_PRECOMPILED=1 pip install --editable .
 ```
-
+To use fbgemm kernels:
+```Shell
+pip install fbgemm-gpu-genai
+```
 **2. Latency benchmarking**
+
+
 ```Shell
 export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-FP8
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
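The added note makes the FP8 results conditional on `fbgemm-gpu-genai` being present. Not part of the original patch, but a quick way to confirm the optional package is installed before running the FP8 benchmark is a plain `pip show`:

```Shell
# Optional sanity check (not in the original diff): confirm the fbgemm
# kernels package is installed before benchmarking the FP8 checkpoint.
pip show fbgemm-gpu-genai
```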
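The hunk only shows the `--batch-size 1` invocation; the `batch_size=256` row in the results table presumably comes from the same script with a different batch size. A minimal sketch, assuming the remaining flags are unchanged:

```Shell
# Hypothetical command for the batch_size=256 row, assuming the same
# input/output lengths as the batch_size=1 run shown in the diff.
export MODEL=Qwen/Qwen3-32B   # or pytorch/Qwen3-32B-FP8
VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py \
  --input-len 256 --output-len 256 --model $MODEL --batch-size 256
```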