import os
from typing import Dict, List, Any

import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
from transformers import pipeline, AutoTokenizer

# Runtime dependencies are installed at import time; detectron2 is then
# imported and its version printed so the install result shows up in the logs.
os.system('pip3 install pycocotools')
os.system('pip3 install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html')

import detectron2

print(f"DETECTRON2 {detectron2.__version__}")


def _unnormalize_box(bbox, width, height):
    """Scale a box from the 0-1000 normalized range to image pixel units.

    Elements 0 and 2 are scaled by ``width``; elements 1 and 3 by ``height``.
    """
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]


class EndpointHandler():
    """Token-classification handler built on LayoutLMv2 (FUNSD label set)."""

    def __init__(self, path=""):
        """Load the processor and the fine-tuned token-classification model.

        Args:
            path: Accepted for interface compatibility; currently unused,
                because both checkpoints are loaded by their hard-coded
                hub identifiers.
        """
        # load the processor and model
        self.processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
        self.model = LayoutLMv2ForTokenClassification.from_pretrained("nielsr/layoutlmv2-finetuned-funsd")

        # Maps the model's class indices to their BIO tag names.
        self.id2label = {
            0: 'O',
            1: 'B-HEADER',
            2: 'I-HEADER',
            3: 'B-QUESTION',
            4: 'I-QUESTION',
            5: 'B-ANSWER',
            6: 'I-ANSWER'
        }

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Run token classification on a single document image.

        Args:
            data: Request payload. The image is taken from ``data["inputs"]``
                (the key is popped from the dict); if the key is absent,
                ``data`` itself is used as the image. The image must expose
                a ``.size`` attribute yielding ``(width, height)``
                (presumably a ``PIL.Image`` -- confirm against the caller).

        Returns:
            A dict with three entries, each holding one item per encoded
            token position:

            - ``"labels"``: list of tag names looked up in ``self.id2label``.
            - ``"boxes"``: list of 4-element float lists, the token boxes
              rescaled from the 0-1000 range to the image's pixel size.
            - ``"is_subword"``: boolean NumPy array, ``True`` where the
              token's offset mapping starts at a non-zero position.
        """
        image = data.pop("inputs", data)

        # encode
        encoding = self.processor(image, truncation=True, return_offsets_mapping=True, return_tensors="pt")
        # The offset mapping is not a model input, so take it out before the
        # encoding is unpacked into the forward call.
        offset_mapping = encoding.pop('offset_mapping')

        # forward pass; gradients are never needed at inference time, so
        # skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**encoding)

        # get predictions; index the single batch element explicitly rather
        # than squeeze(), which would also collapse any other size-1 dimension.
        predictions = outputs.logits.argmax(-1)[0].tolist()
        token_boxes = encoding.bbox[0].tolist()

        width, height = image.size

        true_predictions = [self.id2label[prediction] for prediction in predictions]
        true_boxes = [_unnormalize_box(box, width, height) for box in token_boxes]
        # A non-zero start offset marks a token that continues a word.
        # NOTE(review): this stays a NumPy bool array to keep the existing
        # return value; confirm the serving layer can serialize it.
        is_subword = offset_mapping[0, :, 0].numpy() != 0

        # postprocess the prediction
        return {"labels": true_predictions, "boxes": true_boxes, "is_subword": is_subword}