Create handler.py
handler.py
ADDED
@@ -0,0 +1,73 @@
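This commit adds a custom inference handler (the EndpointHandler convention used by Hugging Face Inference Endpoints): it loads a sequence-classification model and returns Human/Mixed/AI predictions at three granularities, the whole document, each sentence, and each token, with every token scored over a sliding ~20-token context window.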
from typing import Dict, Any

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re


class EndpointHandler:
    def __init__(self, path=""):
        # Load the tokenizer and classifier from the repository path and
        # move the model to GPU when one is available.
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSequenceClassification.from_pretrained(path)
        self.model.eval()
        self.id2label = {0: "Human", 1: "Mixed", 2: "AI"}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def split_into_sentences(self, text: str):
        # Naive splitter: break on whitespace that follows ".", "!" or "?".
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def get_token_predictions(self, text: str):
        # Score each token by classifying a ~20-token context window around it;
        # class index 2 corresponds to the "AI" label.
        tokens = self.tokenizer.tokenize(text)
        token_predictions = []
        for i in range(len(tokens)):
            start = max(0, i - 10)
            end = min(len(tokens), i + 10)
            context = self.tokenizer.convert_tokens_to_string(tokens[start:end])
            inputs = self.tokenizer(context, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            ai_prob = probs[0][2].item()
            # Strip BPE/SentencePiece markers (Ġ, ▁, Ċ) before reporting the token.
            token = tokens[i].replace("Ġ", " ").replace("▁", " ").replace("Ċ", " ").strip()
            if token:
                token_predictions.append({"token": token, "ai_prob": ai_prob})
        return token_predictions

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        text = data.get("inputs", "")

        # Document level
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        doc_result = {
            "prediction": self.id2label[pred],
            "confidence": probs[0][pred].item(),
            "probabilities": {self.id2label[i]: float(p) for i, p in enumerate(probs[0])}
        }

        # Sentence level
        sentences = self.split_into_sentences(text)
        sent_results = []
        for sent in sentences:
            inputs = self.tokenizer(sent, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            pred = torch.argmax(probs, dim=1).item()
            sent_results.append({
                "sentence": sent,
                "prediction": self.id2label[pred],
                "confidence": probs[0][pred].item(),
                "probabilities": {self.id2label[i]: float(p) for i, p in enumerate(probs[0])}
            })

        # Token level
        token_results = self.get_token_predictions(text)

        return {
            "document": doc_result,
            "sentences": sent_results,
            "tokens": token_results
        }
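For a quick sanity check outside the endpoint runtime, the handler can be driven directly. A minimal sketch, assuming the model weights and tokenizer live in the current directory; the path and sample text below are placeholders:

if __name__ == "__main__":
    handler = EndpointHandler(path=".")
    result = handler({"inputs": "First sentence here. A second one follows!"})

    print(result["document"]["prediction"])   # "Human", "Mixed", or "AI"
    print(result["document"]["confidence"])   # probability of the predicted class
    print(len(result["sentences"]))           # one entry per split sentence
    print(len(result["tokens"]))              # one entry per non-empty token

Note that get_token_predictions issues one forward pass per token, so latency grows linearly with input length; batching the context windows would be the natural optimization if token-level output is needed on long documents.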