Update handler.py
handler.py +61 -27
@@ -1,6 +1,8 @@
 from typing import Dict, List, Any
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+import json
+import os
+from transformers import Qwen2TokenizerFast, Qwen2ForCausalLM
 import logging
 
 # Set up logging
@@ -11,43 +13,79 @@ class EndpointHandler:
     def __init__(self, path: str = ""):
         """
         Initialize the handler for Qwen2.5-Coder-7B-Instruct-Omni1.1
-
+        Explicitly using Qwen2 classes to bypass auto-detection
         """
         logger.info(f"Loading model from {path}")
 
         try:
-            # Load tokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(
+            # Check if config exists and log it
+            config_path = os.path.join(path, "config.json")
+            if os.path.exists(config_path):
+                with open(config_path, 'r') as f:
+                    config = json.load(f)
+                logger.info(f"Found config with model_type: {config.get('model_type', 'MISSING')}")
+            else:
+                logger.warning("No config.json found in repository")
+
+            # Load tokenizer explicitly as Qwen2
+            logger.info("Loading tokenizer as Qwen2TokenizerFast...")
+            self.tokenizer = Qwen2TokenizerFast.from_pretrained(
                 path,
                 trust_remote_code=True,
-                use_fast=False,
                 padding_side="left"
             )
 
-            # Ensure pad token is set
+            # Ensure proper tokens
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
-
-            # Set a basic chat template for Qwen
-            self.tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}<|im_start|>user\n{{ message['content'] }}<|im_end|>\n{% elif message['role'] == 'assistant' %}<|im_start|>assistant\n{{ message['content'] }}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+            logger.info("Tokenizer loaded successfully")
 
-            # Load model
-            self.model = AutoModelForCausalLM.from_pretrained(
+            # Load model explicitly as Qwen2ForCausalLM
+            logger.info("Loading model as Qwen2ForCausalLM...")
+            self.model = Qwen2ForCausalLM.from_pretrained(
                 path,
                 torch_dtype=torch.float16,
                 device_map="auto",
                 trust_remote_code=True,
-                low_cpu_mem_usage=True,
-                use_cache=True
+                low_cpu_mem_usage=True
             )
 
             self.model.eval()
-            logger.info("Model and tokenizer loaded successfully")
+            logger.info("Model loaded successfully")
 
         except Exception as e:
-            logger.error(f"Error loading model: {str(e)}")
-            raise
+            logger.error(f"Error during initialization: {str(e)}")
+            # Try alternative loading method
+            try:
+                logger.info("Attempting alternative loading method...")
+
+                # Use the models subdirectory path that we saw in your repo
+                model_path = os.path.join(path, "models", "huggingface") if os.path.exists(os.path.join(path, "models", "huggingface")) else path
+
+                self.tokenizer = Qwen2TokenizerFast.from_pretrained(
+                    model_path,
+                    trust_remote_code=True,
+                    local_files_only=True
+                )
+
+                if self.tokenizer.pad_token is None:
+                    self.tokenizer.pad_token = self.tokenizer.eos_token
+
+                self.model = Qwen2ForCausalLM.from_pretrained(
+                    model_path,
+                    torch_dtype=torch.float16,
+                    device_map="auto",
+                    trust_remote_code=True,
+                    local_files_only=True
+                )
+
+                self.model.eval()
+                logger.info("Alternative loading successful")
+
+            except Exception as e2:
+                logger.error(f"Alternative loading also failed: {str(e2)}")
+                raise e2
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
@@ -62,18 +100,16 @@ class EndpointHandler:
             return [{"error": "No input provided", "generated_text": ""}]
 
         # Generation parameters
-        max_new_tokens = parameters.get("max_new_tokens", 512)
-        temperature = parameters.get("temperature", 0.7)
-        top_p = parameters.get("top_p", 0.9)
+        max_new_tokens = min(parameters.get("max_new_tokens", 512), 1024) # Cap at 1024
+        temperature = max(0.1, min(parameters.get("temperature", 0.7), 2.0)) # Clamp between 0.1 and 2.0
+        top_p = max(0.1, min(parameters.get("top_p", 0.9), 1.0)) # Clamp between 0.1 and 1.0
         do_sample = parameters.get("do_sample", True)
-        repetition_penalty = parameters.get("repetition_penalty", 1.1)
+        repetition_penalty = max(1.0, min(parameters.get("repetition_penalty", 1.1), 2.0))
 
-        # Format input
+        # Format input with Qwen chat template
         if inputs.startswith("<|im_start|>"):
-            # Already formatted
             formatted_input = inputs
         else:
-            # Format as chat
             formatted_input = f"<|im_start|>user\n{inputs}<|im_end|>\n<|im_start|>assistant\n"
 
         # Tokenize
@@ -82,7 +118,7 @@ class EndpointHandler:
             return_tensors="pt",
             add_special_tokens=False,
             truncation=True,
-            max_length=4096
+            max_length=3072 # Leave room for generation
         )
 
         if input_ids.size(1) == 0:
@@ -99,7 +135,7 @@ class EndpointHandler:
            top_p=top_p,
            do_sample=do_sample,
            repetition_penalty=repetition_penalty,
-           pad_token_id=self.tokenizer.eos_token_id,
+           pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            use_cache=True,
            num_return_sequences=1
@@ -115,8 +151,6 @@ class EndpointHandler:
 
         # Clean up response
         response = response.strip()
-
-        # Remove any remaining special tokens manually
         response = response.replace("<|im_end|>", "").strip()
 
         return [{
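For reference, the handler keeps the standard Hugging Face Inference Endpoints custom-handler contract: EndpointHandler(path) loads the model once, and calling the instance with a dict of "inputs" plus optional "parameters" returns a list of dicts. The sketch below is a minimal local smoke test, not part of the commit; the checkout path and prompt are placeholders, and the "generated_text" key on the success path is inferred from the error responses visible in the diff.

# smoke_test.py - minimal local check of the handler contract (sketch).
# Assumes this file sits next to handler.py in a checkout that also
# contains the model weights; the prompt below is a placeholder.
from handler import EndpointHandler

handler = EndpointHandler(path=".")

payload = {
    "inputs": "Write a Python function that reverses a string.",
    "parameters": {
        "max_new_tokens": 256,      # handler caps this at 1024
        "temperature": 0.7,         # handler clamps to [0.1, 2.0]
        "top_p": 0.9,               # handler clamps to [0.1, 1.0]
        "do_sample": True,
        "repetition_penalty": 1.1,  # handler clamps to [1.0, 2.0]
    },
}

result = handler(payload)
# The error path returns [{"error": ..., "generated_text": ""}], so the
# success path is assumed to return the same list-of-dicts shape.
print(result[0].get("generated_text", result[0]))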