File size: 2,870 Bytes
7573456
20bd5b3
1da1f10
20bd5b3
 
1da1f10
450dc54
12e327e
 
173c60d
5585b0a
7573456
5585b0a
1da1f10
 
 
20bd5b3
 
 
 
 
 
 
 
 
 
 
 
 
1da1f10
 
 
 
 
 
 
 
 
 
 
 
20bd5b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1da1f10
20bd5b3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
from typing import Dict, List, Any

import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
from transformers import pipeline, AutoTokenizer

# Install extra packages at import time by shelling out to pip3.
# The exit status returned by os.system is not checked, so a failed install
# only becomes visible when the `import detectron2` below raises.
os.system('pip3 install pycocotools')
# Quiet install of detectron2 from a wheel index; the URL path names the
# cu113 / torch1.10 wheel set.
# NOTE(review): presumably this must match the CUDA/torch versions of the
# serving image -- confirm before changing either.
os.system('pip3 install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html')

# Deliberately imported after the installs above so the package is available.
import detectron2

# Print the detectron2 version that was actually imported.
print(f"DETECTRON2 {detectron2.__version__}")

class EndpointHandler():
    """Inference handler that labels the tokens of a document image.

    The handler runs a LayoutLMv2 processor followed by a LayoutLMv2
    token-classification model and returns, for every token, its predicted
    tag, its bounding box in image coordinates, and a sub-word flag.
    """

    def __init__(self, path=""):
        """Load the processor and the token-classification model.

        Args:
            path: Accepted for interface compatibility. It is not used: both
                checkpoints are loaded by their hard-coded names.
        """
        self.processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
        self.model = LayoutLMv2ForTokenClassification.from_pretrained("nielsr/layoutlmv2-finetuned-funsd")
        # Maps a predicted class index to its BIO tag.
        self.id2label = {
            0: 'O',
            1: 'B-HEADER',
            2: 'I-HEADER',
            3: 'B-QUESTION',
            4: 'I-QUESTION',
            5: 'B-ANSWER',
            6: 'I-ANSWER',
        }

    @staticmethod
    def _unnormalize_box(bbox, width, height):
        """Scale a box from the 0-1000 normalized range to image coordinates.

        Args:
            bbox: Four numbers; indices 0 and 2 are scaled by ``width`` and
                indices 1 and 3 by ``height``.
            width: Image width.
            height: Image height.

        Returns:
            A list of four floats in the image's coordinate space.
        """
        return [
            width * (bbox[0] / 1000),
            height * (bbox[1] / 1000),
            width * (bbox[2] / 1000),
            height * (bbox[3] / 1000),
        ]

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Run token classification on one image.

        Args:
            data: Request payload. The image is taken from (and removed from)
                its ``"inputs"`` key; if that key is absent, ``data`` itself
                is used as the image. The image must expose ``.size`` as a
                ``(width, height)`` pair.
                NOTE(review): presumably a PIL image -- confirm with callers.

        Returns:
            A dict with three per-token entries (no tokens are filtered out):
                - "labels": list of tag strings looked up in ``self.id2label``.
                - "boxes": list of ``[x0, y0, x1, y1]`` float lists scaled to
                  the image size.
                - "is_subword": NumPy boolean array, True where the token's
                  offset-mapping start is non-zero.
        """
        image = data.pop("inputs", data)

        # Encode the image; offsets are requested only for the sub-word flag
        # and are removed before the encoding is fed to the model.
        encoding = self.processor(image, truncation=True, return_offsets_mapping=True, return_tensors="pt")
        offset_mapping = encoding.pop('offset_mapping')

        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = self.model(**encoding)

        # Drop only the leading batch axis (a single image is processed).
        predictions = outputs.logits.argmax(-1).squeeze(0).tolist()
        token_boxes = encoding.bbox.squeeze(0).tolist()

        width, height = image.size

        labels = [self.id2label[prediction] for prediction in predictions]
        boxes = [self._unnormalize_box(box, width, height) for box in token_boxes]
        # A token whose character offset does not start at 0 continues a word.
        is_subword = np.array(offset_mapping.squeeze(0).tolist())[:, 0] != 0

        return {"labels": labels, "boxes": boxes, "is_subword": is_subword}