from llama_cpp import Llama
import time

# Path to a local GGUF model file; adjust to wherever the model is stored.
MODEL_PATH = "./models/deepseek-llm-7b-chat-Q6_K.gguf"

# Load the model once at startup.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,          # context window in tokens
    n_gpu_layers=45,     # number of layers offloaded to the GPU
    n_threads=8,         # CPU threads for any layers left on the CPU
    low_vram=False,
    use_mlock=True,      # lock the model in RAM so it is not swapped out
    verbose=False
)
conversation = [
    "System: You are a highly intelligent and helpful assistant. Answer questions accurately and clearly."
]
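# Optional sketch (not part of the original script): the transcript above grows
# without bound, while n_ctx is 4096 tokens, so a long chat can eventually push
# the prompt past the context window. One simple guard is to drop the oldest
# user/assistant turns (keeping the system line) before rebuilding the prompt.
# The 4-characters-per-token budget is a rough assumption, not an exact count;
# llm.tokenize() could be used instead for a precise figure.
def trim_history(history, max_chars=4096 * 4):
    trimmed = list(history)
    # Keep the system message at index 0; drop the oldest turns after it.
    while len("\n".join(trimmed)) > max_chars and len(trimmed) > 1:
        trimmed.pop(1)
    return trimmed
# Usage would be e.g. `conversation = trim_history(conversation)` right before
# the prompt is rebuilt inside the loop below.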
print("🧠 Live AI Chat Ready (type 'exit' to quit)\n")

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        break

    # Record the user turn and rebuild the full prompt from the transcript.
    conversation.append(f"User: {user_input}")
    prompt = "\n".join(conversation) + "\nAssistant:"

    print("Assistant: ", end="", flush=True)

    output = ""
    start = time.time()

    # Stream tokens and echo them as they arrive.
    for chunk in llm(
        prompt,
        max_tokens=9024,   # note: n_ctx (4096) remains the hard cap on prompt + output
        temperature=0.4,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.1,
        stop=["User:", "System:", "Assistant:"],
        stream=True
    ):
        token = chunk["choices"][0]["text"]
        print(token, end="", flush=True)
        output += token

    print(f"\n⏱️ Response time: {round(time.time() - start, 2)}s\n")

    # Store the assistant turn so the next prompt includes it.
    conversation.append(f"Assistant: {output.strip()}")