# DM-BaseModel-4Bit / handler.py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline

class EndpointHandler:
    def __init__(self, model_path="djangodevloper/llama3-70b-4bit-medqa"):
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True,
            )
            self.pipeline = TextGenerationPipeline(
                model=self.model,
                tokenizer=self.tokenizer,
            )
        except Exception as e:
            raise RuntimeError(f"Failed to initialize model or tokenizer: {e}")
        # PROMPT FOR GENERAL USERS
        self.general_prompt = (
            "You are DoctusMind, a trustworthy and friendly medical AI assistant. "
            "Provide clear, easy-to-understand, and medically accurate answers to everyday health questions. "
            "Use simple language and suggest safe, evidence-informed home remedies when suitable. "
            "Be supportive and avoid technical jargon. Prioritize safety and clarity. "
            "If asked a non-medical question, politely respond with:\n"
            "`{\"not_medical_question\": true}`\n"
            "Format responses with bullet points, headers, or short paragraphs when helpful."
        )

        # PROMPT FOR PROFESSIONAL USERS
        self.professional_prompt = (
            "You are DoctusMind, a highly competent and articulate medical AI assistant for healthcare professionals. "
            "Provide concise, medically rigorous responses using appropriate clinical terminology, diagnostic language, "
            "and pathophysiological reasoning. Reference guidelines (e.g., WHO, CDC, NICE) where relevant. "
            "Always maintain a professional tone and format responses for quick clinical comprehension. "
            "If asked a non-medical question, reply with:\n"
            "`{\"not_medical_question\": true}`"
        )

        # PROMPT FOR CONVERSATION SUMMARY
        self.summary_prompt = (
            "Update the user’s running chat summary by incorporating the most recent messages. "
            "Preserve important context like health conditions, preferences, personal facts, "
            "or constraints. Keep the summary compact and in User: ...\\nBot: ... format. "
            "Omit small talk unless relevant."
        )

        # PROMPT FOR CONVERSATION HEADER
        self.header_prompt = (
            "Generate a short and meaningful header (max 50 characters) based on the conversation."
        )
    def __call__(self, data):
        try:
            user_input = data.get("inputs", "")
            user_type = data.get("user_type", "general").strip().lower()
            mode = data.get("mode", "chat").strip().lower()

            if not user_input:
                return {"error": "Missing 'inputs' in request."}

            # Pick the system prompt for the requested mode and user type
            if mode == "summary":
                system_prompt = self.summary_prompt
            elif mode == "header":
                system_prompt = self.header_prompt
            else:
                system_prompt = self.professional_prompt if user_type == "professional" else self.general_prompt

            # Compose the prompt without unnecessary newlines
            full_prompt = f"<|system|>{system_prompt}<|user|>{user_input}<|assistant|>"

            # Generate
            outputs = self.pipeline(
                full_prompt,
                max_new_tokens=600,       # Capped at 600 for latency, still enough for full answers
                temperature=0.1,          # NOTE: temperature/top_k/top_p only apply when do_sample=True
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.05,
                do_sample=False,          # Greedy decoding: deterministic, less decoding time
                eos_token_id=[
                    self.tokenizer.eos_token_id,
                    self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
                ],
            )

            # Keep only the text after the assistant tag
            generated_text = outputs[0]["generated_text"]
            response = generated_text.split("<|assistant|>")[-1].strip()

            # Fallback if generation came back empty
            if not response:
                response = "Sorry, I couldn't generate a complete response. Try rephrasing."

            return {"generated_text": response}
        except Exception as e:
            return {"error": f"Inference error: {str(e)}"}