# DM-BaseModel-4Bit / handler.py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline

class EndpointHandler:
    def __init__(self, model_path="djangodevloper/llama3-70b-4bit-medqa"):
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True,
            )
            self.pipeline = TextGenerationPipeline(
                model=self.model,
                tokenizer=self.tokenizer,
            )
        except Exception as e:
            raise RuntimeError(f"Failed to initialize model or tokenizer: {e}")
        # PROMPT FOR GENERAL USERS
        self.general_prompt = (
            "You are DoctusMind, a trustworthy and friendly medical AI assistant. "
            "Provide clear, easy-to-understand, and medically accurate answers to everyday health questions. "
            "Use simple language and suggest safe, evidence-informed home remedies when suitable. "
            "Be supportive and avoid technical jargon. Prioritize safety and clarity. "
            "If asked a non-medical question, politely respond with:\n"
            "`{\"not_medical_question\": true}`\n"
            "Format responses with bullet points, headers, or short paragraphs when helpful."
        )

        # PROMPT FOR PROFESSIONAL USERS
        self.professional_prompt = (
            "You are DoctusMind, a highly competent and articulate medical AI assistant for healthcare professionals. "
            "Provide concise, medically rigorous responses using appropriate clinical terminology, diagnostic language, "
            "and pathophysiological reasoning. Reference guidelines (e.g., WHO, CDC, NICE) where relevant. "
            "Always maintain a professional tone and format responses for quick clinical comprehension. "
            "If asked a non-medical question, reply with:\n"
            "`{\"not_medical_question\": true}`"
        )

        # PROMPT FOR CONVERSATION SUMMARY
        self.summary_prompt = (
            "Update the user’s running chat summary by incorporating the most recent messages. "
            "Preserve important context like health conditions, preferences, personal facts, "
            "or constraints. Keep the summary compact and in User: ...\\nBot: ... format. "
            "Omit small talk unless relevant."
        )

        # PROMPT FOR CONVERSATION HEADER
        self.header_prompt = (
            "Generate a short and meaningful header (max 50 characters) based on the conversation."
        )
    def __call__(self, data):
        try:
            user_input = data.get("inputs", "")
            user_type = data.get("user_type", "general").strip().lower()
            mode = data.get("mode", "chat").strip().lower()

            if not user_input:
                return {"error": "Missing 'inputs' in request."}

            # Pick the system prompt for the requested mode and user type
            if mode == "summary":
                system_prompt = self.summary_prompt
            elif mode == "header":
                system_prompt = self.header_prompt
            else:
                system_prompt = self.professional_prompt if user_type == "professional" else self.general_prompt

            # Compose the prompt without unnecessary newlines
            full_prompt = f"<|system|>{system_prompt}<|user|>{user_input}<|assistant|>"

            # Generate
            outputs = self.pipeline(
                full_prompt,
                max_new_tokens=600,       # Capped at 600 for latency, still enough for full answers
                temperature=0.1,          # NOTE: temperature/top_k/top_p only apply when do_sample=True
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.05,
                do_sample=False,          # Greedy decoding: deterministic, less decoding time
                eos_token_id=[
                    self.tokenizer.eos_token_id,
                    self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
                ],
            )

            # Keep only the text after the assistant tag
            generated_text = outputs[0]["generated_text"]
            response = generated_text.split("<|assistant|>")[-1].strip()

            # Fallback if generation came back empty
            if not response:
                response = "Sorry, I couldn't generate a complete response. Try rephrasing."

            return {"generated_text": response}
        except Exception as e:
            return {"error": f"Inference error: {str(e)}"}