Donnyed committed on
Commit
53a13a1
·
verified ·
1 Parent(s): 2f4a460

Create handler.py

Browse files
Files changed (1) hide show
  1. handler.py +73 -0
handler.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import torch
4
+ import re
5
+
6
class EndpointHandler():
    """Callable handler that scores text as "Human", "Mixed" or "AI".

    The handler loads a tokenizer and a sequence-classification model from
    ``path`` and, for each request, reports class probabilities at three
    granularities: the whole document, each sentence, and each tokenizer
    token (scored through a small window of surrounding tokens).
    """

    # Truncation limit passed to the tokenizer on every model call.
    MAX_LENGTH = 512
    # Position of the "AI" class in the logits; must agree with ``id2label``.
    AI_LABEL_ID = 2
    # A sentence ends at whitespace that directly follows '.', '!' or '?'.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
    # Sub-word marker characters that are turned into spaces (and then
    # stripped) before a token is reported.
    _MARKER_TABLE = str.maketrans({"Ġ": " ", "▁": " ", "Ċ": " "})

    def __init__(self, path=""):
        """Load tokenizer and model from ``path`` and move the model to GPU if available.

        Args:
            path: Location handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSequenceClassification.from_pretrained(path)
        self.model.eval()
        # NOTE(review): label order is hard-coded here rather than read from
        # the model config — confirm it matches the checkpoint's training labels.
        self.id2label = {0: "Human", 1: "Mixed", 2: "AI"}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def _class_probabilities(self, text: str):
        """Run one forward pass on ``text`` and return its softmax probabilities.

        Returns:
            A 1-D tensor with one probability per class (first batch row).
        """
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, max_length=self.MAX_LENGTH
        )
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}
        with torch.no_grad():
            logits = self.model(**encoded).logits
        return torch.softmax(logits, dim=1)[0]

    def _label_scores(self, probs) -> Dict[str, Any]:
        """Turn a 1-D probability tensor into the prediction/confidence/probabilities dict."""
        best = int(torch.argmax(probs).item())
        # One bulk conversion instead of a device round-trip per element.
        values = probs.tolist()
        return {
            "prediction": self.id2label[best],
            "confidence": values[best],
            "probabilities": {self.id2label[i]: p for i, p in enumerate(values)},
        }

    def split_into_sentences(self, text: str):
        """Split ``text`` after '.', '!' or '?' followed by whitespace.

        Returns:
            The stripped, non-empty sentence strings in original order.
        """
        pieces = (piece.strip() for piece in self._SENTENCE_BOUNDARY.split(text))
        return [piece for piece in pieces if piece]

    def get_token_predictions(self, text: str, window: int = 10):
        """Score every token of ``text`` by classifying its surrounding window.

        For token ``i`` the context is ``tokens[i - window : i + window]``
        (clamped to the token list), i.e. up to ``window`` tokens before it
        and ``window - 1`` after it. The context is converted back to a
        string and classified; the "AI" probability is attributed to the token.

        Args:
            text: Text to tokenize and score.
            window: Context radius in tokens (default 10).

        Returns:
            A list of ``{"token": str, "ai_prob": float}`` dicts. Tokens that
            are empty once marker characters and whitespace are removed are
            omitted.
        """
        tokens = self.tokenizer.tokenize(text)
        total = len(tokens)
        token_predictions = []
        for i, raw in enumerate(tokens):
            token = raw.translate(self._MARKER_TABLE).strip()
            if not token:
                # Nothing to report for this token, so skip the forward pass.
                continue
            start = max(0, i - window)
            end = min(total, i + window)
            context = self.tokenizer.convert_tokens_to_string(tokens[start:end])
            ai_prob = self._class_probabilities(context)[self.AI_LABEL_ID].item()
            token_predictions.append({"token": token, "ai_prob": ai_prob})
        return token_predictions

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Classify ``data["inputs"]`` at document, sentence and token level.

        Args:
            data: Request payload; the text is read from the ``"inputs"`` key
                and defaults to the empty string when the key is absent.

        Returns:
            ``{"document": {...}, "sentences": [...], "tokens": [...]}`` where
            the document and each sentence entry carry ``prediction``,
            ``confidence`` and ``probabilities`` (sentence entries also carry
            ``sentence``), and ``tokens`` is the output of
            ``get_token_predictions``.

        Raises:
            TypeError: If ``data["inputs"]`` is not a string.
        """
        text = data.get("inputs", "")
        if not isinstance(text, str):
            # Fail early with a clear message instead of deep inside the
            # tokenizer or the sentence-splitting regex.
            raise TypeError(
                f'"inputs" must be a string, got {type(text).__name__}'
            )

        # Document level
        doc_result = self._label_scores(self._class_probabilities(text))

        # Sentence level
        sent_results = [
            {"sentence": sent, **self._label_scores(self._class_probabilities(sent))}
            for sent in self.split_into_sentences(text)
        ]

        # Token level
        token_results = self.get_token_predictions(text)

        return {
            "document": doc_result,
            "sentences": sent_results,
            "tokens": token_results,
        }