from llama_cpp import Llama
import time
# Load the quantized GGUF model (make sure this path is correct)
MODEL_PATH = "./models/deepseek-llm-7b-chat-Q6_K.gguf"
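# The path above is an assumption: it expects a local GGUF quantization of
# deepseek-llm-7b-chat (e.g. a Q6_K file downloaded from a GGUF repo on
# Hugging Face). Adjust it to wherever your copy actually lives.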
# Load model into GPU with optimal settings
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,        # larger context window so the chat history fits
    n_gpu_layers=45,   # try offloading all layers to the RTX 4060 (adjust if needed)
    n_threads=8,       # CPU threads for any layers not offloaded (depends on your CPU)
    low_vram=False,    # don't trade speed for VRAM; use the GPU fully
    use_mlock=True,    # lock model pages in memory (if supported by the OS)
    verbose=False,     # silence llama.cpp logs
)
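# Note: n_gpu_layers=45 assumes the whole model fits in VRAM (a Q6_K 7B file
# is roughly 5-6 GB, so an 8 GB RTX 4060 can usually offload every layer).
# If you hit CUDA out-of-memory errors, lower n_gpu_layers.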
# Chat loop with memory
conversation = [
    "System: You are a highly intelligent and helpful assistant. Answer questions accurately and clearly."
]
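# Note: this builds a generic plain-text "System:/User:/Assistant:" prompt
# rather than using the model's native chat template. llama-cpp-python's
# llm.create_chat_completion() can apply the template embedded in the GGUF
# metadata, if you prefer that route.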
print("🧠 Live AI Chat Ready (type 'exit' to quit)\n")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        break
    conversation.append(f"User: {user_input}")
    prompt = "\n".join(conversation) + "\nAssistant:"
    # Stream the response token by token
    print("Assistant: ", end="", flush=True)
    output = ""
    start = time.time()
    for chunk in llm(
        prompt,
        max_tokens=9024,   # upper bound only; generation is effectively capped by n_ctx
        temperature=0.4,
        top_p=0.9,
        top_k=20,
        repeat_penalty=1.1,
        stop=["User:", "System:", "Assistant:"],
        stream=True,
    ):
        token = chunk["choices"][0]["text"]
        print(token, end="", flush=True)
        output += token
print(f"\⏱️ Response time: {round(time.time() - start, 2)}s\n")
conversation.append(f"Assistant: {output.strip()}")