from llama_cpp import Llama
import time
# Load the quantized GGUF model (make sure this path is correct)
MODEL_PATH = "./models/deepseek-llm-7b-chat-Q6_K.gguf"
# Load model into GPU with optimal settings
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,        # Context window size; large enough to hold the running chat history
    n_gpu_layers=45,   # Offload all layers to the GPU on an RTX 4060 (lower if you run out of VRAM)
    n_threads=8,       # CPU threads for any layers left on the CPU (tune to your CPU)
    low_vram=False,    # Do not use low-VRAM mode; use the GPU fully
    use_mlock=True,    # Lock the model in memory to avoid swapping (if supported)
    verbose=False      # Silence llama.cpp logs
)
# Chat loop with memory
conversation = [
    "System: You are a highly intelligent and helpful assistant. Answer questions accurately and clearly."
]
print("🧠 Live AI Chat Ready (type 'exit' to quit)\n")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        break
    conversation.append(f"User: {user_input}")
    prompt = "\n".join(conversation) + "\nAssistant:"

    # Streaming response
    print("Assistant: ", end="", flush=True)
    output = ""
    start = time.time()
    for chunk in llm(
        prompt,
        max_tokens=9024,        # Generation cap (note: larger than n_ctx=4096)
        temperature=0.4,        # Low temperature for focused, consistent answers
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.1,
        stop=["User:", "System:", "Assistant:"],  # Stop before the model writes the next turn
        stream=True             # Yield tokens as they are generated
    ):
        token = chunk["choices"][0]["text"]
        print(token, end="", flush=True)
        output += token
print(f"\⏱️ Response time: {round(time.time() - start, 2)}s\n")
conversation.append(f"Assistant: {output.strip()}")