from llama_cpp import Llama
import time

# Load the quantized GGUF model (make sure this path is correct)
MODEL_PATH = "./models/deepseek-llm-7b-chat-Q6_K.gguf"

# Load model into GPU with optimal settings
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,          # Higher context for conversation memory
    n_gpu_layers=45,     # Try full GPU offload for RTX 4060 (adjust if needed)
    n_threads=8,         # CPU threads for any layers left on CPU (depends on your CPU)
    low_vram=False,      # Use full GPU
    use_mlock=True,      # Lock model in memory (if supported)
    verbose=False        # Silence logs
)

# Chat loop with memory
conversation = [
    "System: You are a highly intelligent and helpful assistant. Answer questions accurately and clearly."
]

print("🧠 Live AI Chat Ready (type 'exit' to quit)\n")

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        break

    conversation.append(f"User: {user_input}")
    prompt = "\n".join(conversation) + "\nAssistant:"

    # Streaming response
    print("Assistant: ", end="", flush=True)
    output = ""
    start = time.time()

    for chunk in llm(
        prompt,
        max_tokens=9024,     # Note: exceeds n_ctx=4096, so output is effectively capped by the context window
        temperature=0.4,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.1,
        stop=["User:", "System:", "Assistant:"],
        stream=True
    ):
        token = chunk["choices"][0]["text"]
        print(token, end="", flush=True)
        output += token

    print(f"\n⏱️ Response time: {round(time.time() - start, 2)}s\n")
    conversation.append(f"Assistant: {output.strip()}")