nbroad committed on
Commit
9ba0b20
·
1 Parent(s): a20930b

Create handler.py

Browse files
Files changed (1) hide show
  1. handler.py +65 -0
handler.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModel
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from typing import Any, Dict, List
5
+
6
+
7
+ # copied from the model card
8
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (assumed to be shaped batch x tokens x hidden).
        attention_mask: Mask with one entry per token; it is broadcast over
            the embedding dimension and used as a 0/1 weight.

    Returns:
        A tensor with dimension 1 (the token axis) reduced away: the
        mask-weighted sum of the embeddings divided by the mask total.
    """
    hidden_states = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' shape
    # so it can weight every embedding component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(1)
    # Clamp the denominator so a fully masked row divides by 1e-9, not zero.
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_total / weight_total
12
+
13
class EndpointHandler():
    """Inference handler that turns text into L2-normalized sentence embeddings.

    The model and tokenizer are loaded once in ``__init__``; each call
    tokenizes the request inputs, runs the model, mean-pools the token
    embeddings with the attention mask and L2-normalizes the result.
    """

    def __init__(self, path="./"):
        """Load the model and tokenizer and place the model on the device.

        Args:
            path: Location handed to ``from_pretrained`` for both the model
                and the tokenizer.
        """
        self.model = AutoModel.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

        # Use the GPU when torch reports one, otherwise stay on the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def __call__(self, data: Any) -> Dict[str, List[List[float]]]:
        """Embed the inputs contained in a request payload.

        Args:
            data: Request payload. ``data["inputs"]`` is passed to the
                tokenizer (presumably a string or a list of strings --
                confirm against the serving runtime). ``data["parameters"]``
                is optional and may carry ``max_length``.

        Returns:
            A dict ``{"embeddings": [[float, ...], ...]}`` holding one
            L2-normalized embedding per input sequence.
        """
        # Read with ``get`` rather than ``pop`` so the caller's payload is
        # left untouched.
        inputs = data.get("inputs", data)
        parameters = data.get("parameters")

        # Without parameters, max_length stays None and the tokenizer applies
        # its own limit; with parameters but no "max_length", 512 is used.
        if parameters is None:
            max_length = None
        else:
            max_length = parameters.get("max_length", 512)

        with torch.inference_mode():
            encoded = self.tokenizer(
                inputs,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=max_length,
            ).to(self.device)

            model_output = self.model(**encoded)

            # Pool over tokens, counting only positions the mask keeps.
            sentence_embeddings = mean_pooling(model_output, encoded['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

        # Move to CPU and convert to plain Python lists for the response.
        return {
            "embeddings": sentence_embeddings.cpu().tolist()
        }