from llama_cpp import Llama
import time

# Path to a local GGUF model file; adjust to wherever the model is stored.
MODEL_PATH = "./models/deepseek-llm-7b-chat-Q6_K.gguf"

# Load the model once at startup.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,          # context window in tokens
    n_gpu_layers=45,     # number of layers offloaded to the GPU
    n_threads=8,         # CPU threads for any layers left on the CPU
    low_vram=False,
    use_mlock=True,      # lock the model in RAM so it is not swapped out
    verbose=False
)
conversation = [
    "System: You are a highly intelligent and helpful assistant. Answer questions accurately and clearly."
]
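# Optional sketch (not part of the original script): the transcript above grows
# without bound, while n_ctx is 4096 tokens, so a long chat can eventually push
# the prompt past the context window. One simple guard is to drop the oldest
# user/assistant turns (keeping the system line) before rebuilding the prompt.
# The 4-characters-per-token budget is a rough assumption, not an exact count;
# llm.tokenize() could be used instead for a precise figure.
def trim_history(history, max_chars=4096 * 4):
    trimmed = list(history)
    # Keep the system message at index 0; drop the oldest turns after it.
    while len("\n".join(trimmed)) > max_chars and len(trimmed) > 1:
        trimmed.pop(1)
    return trimmed
# Usage would be e.g. `conversation = trim_history(conversation)` right before
# the prompt is rebuilt inside the loop below.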
print("🧠 Live AI Chat Ready (type 'exit' to quit)\n")

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        break

    # Record the user turn and rebuild the full prompt from the transcript.
    conversation.append(f"User: {user_input}")
    prompt = "\n".join(conversation) + "\nAssistant:"

    print("Assistant: ", end="", flush=True)

    output = ""
    start = time.time()

    # Stream tokens and echo them as they arrive.
    for chunk in llm(
        prompt,
        max_tokens=9024,   # note: n_ctx (4096) remains the hard cap on prompt + output
        temperature=0.4,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.1,
        stop=["User:", "System:", "Assistant:"],
        stream=True
    ):
        token = chunk["choices"][0]["text"]
        print(token, end="", flush=True)
        output += token

    print(f"\n⏱️ Response time: {round(time.time() - start, 2)}s\n")

    # Store the assistant turn so the next prompt includes it.
    conversation.append(f"Assistant: {output.strip()}")