# blip2-image-to-text / handler.py
# Author: thoth-AI
# Commit db9328f: First push of custom handler for Blip2 model to be used in Inference API
# File size: 1.72 kB
from typing import Dict, List, Any
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
from io import BytesIO
import torch
import os
# Torch device chosen once at import time: CUDA when it is available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
class EndpointHandler:
    """Inference handler that generates captions for images with BLIP-2.

    The processor and model are loaded once, at construction time, from the
    ``Salesforce/blip2-opt-2.7b`` checkpoint. Calling the instance runs the
    processor on the request's images, generates token ids with the model and
    decodes them into caption strings.
    """

    def __init__(self, path=""):
        """Load the BLIP-2 processor and model.

        Args:
            path: Accepted for interface compatibility. It is not used: the
                checkpoint is always loaded by its hub id.
        """
        checkpoint = "Salesforce/blip2-opt-2.7b"
        self.processor = Blip2Processor.from_pretrained(checkpoint)
        # device_map="auto" already places the weights on the available
        # hardware, so no explicit move is needed afterwards. A hard-coded
        # ``.to("cuda")`` here would fail on hosts without a GPU.
        self.model = Blip2ForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")
        self.model.eval()

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Generate captions for the images carried by a request.

        Args:
            data: Request payload. ``data["inputs"]`` is handed to the
                processor as ``images``; when the key is absent the payload
                itself is used. ``data["parameters"]`` (optional mapping) is
                forwarded as extra keyword arguments to ``model.generate``.
                Both keys are popped from ``data``.

        Returns:
            A dict with a single key, ``"captions"``, holding the list of
            decoded caption strings, e.g.
            ``{"captions": ["A hugging face at the office"]}``.
        """
        inputs = data.pop("inputs", data)
        # A request may carry ``"parameters": null``; treat it like "no parameters"
        # instead of failing when the mapping is unpacked below.
        parameters = data.pop("parameters", None) or {}

        # Send the inputs to wherever the model actually lives rather than to a
        # device guessed independently of how the model was loaded.
        target_device = self.model.device
        model_inputs = self.processor(images=inputs, return_tensors="pt").to(target_device)

        # Request parameters are merged last so they take precedence on key clashes.
        generate_kwargs = {**model_inputs, **parameters}
        with torch.no_grad():
            generated_ids = self.model.generate(**generate_kwargs)

        captions = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        return {"captions": captions}