jerryzh168 commited on
Commit
5c8218f
·
verified ·
1 Parent(s): 586ec20

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +8 -1
README.md CHANGED
@@ -236,6 +236,8 @@ print(f"Peak Memory Usage: {mem:.02f} GB")
236
  | latency (batch_size=1) | 8.93s | 5.16s (1.73x speedup) |
237
  | latency (batch_size=256) | 33.85s | 16.15s (2.10x speedup) |
238
 
 
 
239
  <details>
240
  <summary> Reproduce latency benchmarks </summary>
241
 
@@ -245,8 +247,13 @@ git clone git@github.com:vllm-project/vllm.git
245
  cd vllm
246
  VLLM_USE_PRECOMPILED=1 pip install --editable .
247
  ```
248
-
 
 
 
249
  **2. Latency benchmarking**
 
 
250
  ```Shell
251
  export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-FP8
252
  VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 
236
  | latency (batch_size=1) | 8.93s | 5.16s (1.73x speedup) |
237
  | latency (batch_size=256) | 33.85s | 16.15s (2.10x speedup) |
238
 
239
+ Note: the latency benchmarks above were run with `fbgemm-gpu-genai` installed.
240
+
241
  <details>
242
  <summary> Reproduce latency benchmarks </summary>
243
 
 
247
  cd vllm
248
  VLLM_USE_PRECOMPILED=1 pip install --editable .
249
  ```
250
+ To use the fbgemm kernels, install the package:
251
+ ```Shell
252
+ pip install fbgemm-gpu-genai
253
+ ```
254
  **2. Latency benchmarking**
255
+
256
+
257
  ```Shell
258
  export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-FP8
259
  VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1