# ViTCM-LLM / inference.py
# Mark-CHAE's picture
# Upload folder using huggingface_hub
# 8374b0f verified
import base64
import io
import logging

import torch
from peft import PeftModel
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    Qwen2_5_VLForConditionalGeneration,
)
# Load model, tokenizer and processor
@torch.no_grad()
def load_model(
    base_model_id: str = "Qwen/Qwen2.5-VL-32B-Instruct",
    adapter_id: str = "Mark-CHAE/shezhen",
):
    """Load the ViTCM_LLM model for Traditional Chinese Medicine tongue diagnosis.

    Args:
        base_model_id: Hub id (or local path) of the Qwen2.5-VL base checkpoint.
        adapter_id: Hub id (or local path) of the PEFT adapter applied on top
            of the base model.

    Returns:
        A ``(model, tokenizer, processor)`` tuple, where ``model`` is the base
        model wrapped with the PEFT adapter.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    processor = AutoProcessor.from_pretrained(base_model_id)
    # Qwen2.5-VL is a vision-language (conditional generation) model; it is not
    # registered under AutoModelForCausalLM, so the dedicated class is used.
    base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        base_model_id,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(base_model, adapter_id)
    return model, tokenizer, processor
# Initialize model: load everything once at import time so that query() can
# reuse the module-level model, tokenizer and processor on every call.
model, tokenizer, processor = load_model()
def _decode_image(image: str) -> "Image.Image":
    """Decode a base64 string (optionally a ``data:`` URL) into an RGB PIL image."""
    if not image or not image.strip():
        raise ValueError("image must be a non-empty base64 string")
    payload = image.strip()
    # Clients often send "data:image/png;base64,<payload>"; the prefix would be
    # mis-decoded as data, so keep only the part after the first comma.
    if isinstance(payload, str) and payload.startswith("data:"):
        _, _, payload = payload.partition(",")
    raw = base64.b64decode(payload)
    # Normalise to RGB so palette / alpha / greyscale uploads are handled uniformly.
    return Image.open(io.BytesIO(raw)).convert("RGB")


def query(question: str, image: str) -> str:
    """Analyze a tongue image for Traditional Chinese Medicine diagnosis.

    Args:
        question: The question about the tongue image
            (e.g., "根据图片判断舌诊内容").
        image: Base64 encoded image string; a leading ``data:...;base64,``
            prefix is accepted and ignored.

    Returns:
        The TCM diagnosis analysis of the tongue, or a string starting with
        ``"Error processing request: "`` if anything fails. This function
        never raises.
    """
    try:
        image_pil = _decode_image(image)

        # Let the processor's chat template emit the image placeholder tokens
        # the model expects, instead of hand-writing special tokens.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question},
                ],
            }
        ]
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # The model is loaded with device_map="auto", so the inputs must be
        # moved to the device the model's first layers live on.
        inputs = processor(
            text=[prompt],
            images=[image_pil],
            return_tensors="pt",
        ).to(model.device)

        # max_new_tokens (not max_length): the prompt already contains many
        # image tokens, which max_length would count against the budget.
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # generate() returns prompt + completion; drop the prompt tokens so
        # only the assistant's answer is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        answer_ids = outputs[0][prompt_length:]
        return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
    except Exception as e:
        # Request boundary: report the failure as text to the caller, but keep
        # the traceback in the logs so errors are not silently lost.
        logging.getLogger(__name__).exception("Tongue diagnosis query failed")
        return f"Error processing request: {str(e)}"