Update README.md
README.md CHANGED
@@ -38,36 +38,6 @@ python3 quantize_quark.py --model_dir "meta-llama/Meta-Llama-3.1-405B-Instruct"
 
 # Deployment
 
-## Use with vLLM
-
-This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
-
-```python
-from vllm import LLM, SamplingParams
-from transformers import AutoTokenizer
-
-model_id = "amd/Llama-3.1-405B-Instruct-MXFP4-Preview"
-number_gpus = 8
-
-sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-messages = [
-    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
-    {"role": "user", "content": "Who are you?"},
-]
-
-prompts = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-
-llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=4096)
-
-outputs = llm.generate(prompts, sampling_params)
-
-generated_text = outputs[0].outputs[0].text
-print(generated_text)
-```
-
 ## Evaluation
 
 The model was evaluated on MMLU and GSM8K_COT.