from llama_cpp import Llama
import time

# Load the quantized GGUF model (make sure this path is correct)
MODEL_PATH = "./models/deepseek-llm-7b-chat-Q6_K.gguf"

# Load model into GPU with optimal settings
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,          # Higher context for conversation memory
    n_gpu_layers=45,     # Try full GPU offload for RTX 4060 (adjust if needed)
    n_threads=8,         # CPU threads for any layers left on CPU (depends on your CPU)
    low_vram=False,      # Use full GPU
    use_mlock=True,      # Lock model in memory (if supported)
    verbose=False        # Silence logs
)

# Chat loop with memory
conversation = [
    "System: You are a highly intelligent and helpful assistant. Answer questions accurately and clearly."
]

print("🧠 Live AI Chat Ready (type 'exit' to quit)\n")

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        break

    conversation.append(f"User: {user_input}")
    prompt = "\n".join(conversation) + "\nAssistant:"

    # Streaming response
    print("Assistant: ", end="", flush=True)
    output = ""
    start = time.time()

    for chunk in llm(
        prompt,
        max_tokens=9024,     # Note: exceeds n_ctx=4096, so output is effectively capped by the context window
        temperature=0.4,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.1,
        stop=["User:", "System:", "Assistant:"],
        stream=True
    ):
        token = chunk["choices"][0]["text"]
        print(token, end="", flush=True)
        output += token

    print(f"\n⏱️ Response time: {round(time.time() - start, 2)}s\n")
    conversation.append(f"Assistant: {output.strip()}")