Update README.md
README.md
CHANGED

@@ -39,7 +39,6 @@ Once your server is started, you can query the model using the OpenAI API:
 ```python
 from openai import OpenAI

-# Modify OpenAI's API key and API base to use vLLM's API server.
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 client = OpenAI(
@@ -62,7 +61,7 @@ And there's more. You can run 2:4 sparse models on vLLM and get significant spee
 prompt = f"Give a TL;DR of the following Reddit post.\n<|user|>{post}\nTL;DR:\n<|assistant|>\n"

 completion = client.completions.create(
-    model="RedHatAI/Llama-3.1-8B-tldr",
+    model="RedHatAI/Llama-3.1-8B-tldr-FP8-dynamic",
     prompt=prompt,
     max_tokens=256,
 )
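For context, here is a minimal sketch of what the updated README snippet looks like end to end after this change. It assumes a vLLM OpenAI-compatible server is already running at `localhost:8000`, and the `post` string and the final `print` line are placeholders added for illustration, not part of the diff:

```python
from openai import OpenAI

# vLLM's OpenAI-compatible server does not check the key, but the client requires one.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Placeholder Reddit post; substitute the text you want summarized.
post = "I rebuilt my mechanical keyboard over the weekend and it took far longer than expected..."

prompt = f"Give a TL;DR of the following Reddit post.\n<|user|>{post}\nTL;DR:\n<|assistant|>\n"

completion = client.completions.create(
    model="RedHatAI/Llama-3.1-8B-tldr-FP8-dynamic",
    prompt=prompt,
    max_tokens=256,
)
print(completion.choices[0].text)
```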