Update handler.py

Browse files

Files changed (1) hide show

handler.py +154 -230

handler.py CHANGED Viewed

@@ -3,9 +3,7 @@ import torch
 import json
 import os
 import glob
-import tempfile
-from transformers import PreTrainedTokenizerFast, PreTrainedModel
-from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 import logging
 # Set up logging
@@ -15,228 +13,131 @@ logger = logging.getLogger(__name__)
 class EndpointHandler:
     def __init__(self, path: str = ""):
         """
-        Manual model loading that completely bypasses auto-detection
         """
         logger.info(f"Loading model from {path}")
         try:
-            # Set cache directories to temp to avoid memory issues
-            os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
-            os.environ['HF_HOME'] = '/tmp/hf_home'
-            os.environ['TOKENIZERS_PARALLELISM'] = 'false'
-            # Clear any existing cache
-            self._clear_cache()
-            # Find the actual model files
-            model_path = self._discover_model_files(path)
-            logger.info(f"Model files found at: {model_path}")
-            # Load config manually
-            config = self._load_config_manually(model_path)
-            logger.info(f"Config loaded: {config.model_type}")
-            # Load tokenizer manually
-            self.tokenizer = self._load_tokenizer_manually(model_path)
             logger.info("Tokenizer loaded successfully")
-            # Create model architecture manually
-            self.model = self._create_model_manually(config, model_path)
             logger.info("Model loaded successfully")
         except Exception as e:
             logger.error(f"Failed to initialize: {str(e)}")
             raise e
-    def _clear_cache(self):
-        """Clear any cached model data to free memory"""
-        try:
-            import shutil
-            cache_dirs = ['/tmp/transformers_cache', '/tmp/hf_home']
-            for cache_dir in cache_dirs:
-                if os.path.exists(cache_dir):
-                    shutil.rmtree(cache_dir)
-                    logger.info(f"Cleared cache: {cache_dir}")
-        except Exception as e:
-            logger.warning(f"Could not clear cache: {e}")
-    def _discover_model_files(self, base_path: str) -> str:
-        """Find where the actual model files are located"""
-        logger.info(f"Searching for model files in: {base_path}")
-        # List all contents
-        if os.path.exists(base_path):
-            contents = os.listdir(base_path)
-            logger.info(f"Base directory contents: {contents}")
-            # Check for config.json in base path
-            if "config.json" in contents:
-                logger.info("Found config.json in base directory")
-                return base_path
-            # Check models subdirectories
-            for item in contents:
-                if os.path.isdir(os.path.join(base_path, item)):
-                    sub_path = os.path.join(base_path, item)
-                    sub_contents = os.listdir(sub_path)
-                    logger.info(f"Subdirectory {item}: {sub_contents}")
-                    if "config.json" in sub_contents:
-                        logger.info(f"Found config.json in {item} subdirectory")
-                        return sub_path
-            # Search recursively
-            for root, dirs, files in os.walk(base_path):
-                if "config.json" in files:
-                    logger.info(f"Found config.json in {root}")
-                    return root
-        raise FileNotFoundError(f"No config.json found in {base_path} or subdirectories")
-    def _load_config_manually(self, model_path: str) -> Qwen2Config:
-        """Load and create config manually"""
-        config_path = os.path.join(model_path, "config.json")
-        logger.info(f"Loading config from: {config_path}")
-        with open(config_path, 'r') as f:
-            config_dict = json.load(f)
-        logger.info(f"Config keys: {list(config_dict.keys())}")
-        logger.info(f"Model type: {config_dict.get('model_type', 'NOT_FOUND')}")
-        # Ensure model_type is set correctly
-        if 'model_type' not in config_dict:
-            config_dict['model_type'] = 'qwen2'
-            logger.info("Set model_type to 'qwen2'")
-        # Create config object
-        config = Qwen2Config(**config_dict)
-        return config
-    def _load_tokenizer_manually(self, model_path: str) -> PreTrainedTokenizerFast:
-        """Load tokenizer without auto-detection"""
-        # Look for tokenizer files
-        tokenizer_files = []
-        for file in os.listdir(model_path):
-            if file in ['tokenizer.json', 'tokenizer_config.json', 'vocab.json']:
-                tokenizer_files.append(file)
-        logger.info(f"Found tokenizer files: {tokenizer_files}")
-        if 'tokenizer.json' in tokenizer_files:
-            # Load from tokenizer.json
-            tokenizer_path = os.path.join(model_path, 'tokenizer.json')
-            logger.info(f"Loading tokenizer from {tokenizer_path}")
-            tokenizer = PreTrainedTokenizerFast(
-                tokenizer_file=tokenizer_path,
-                unk_token="<|endoftext|>",
-                bos_token="<|endoftext|>",
-                eos_token="<|endoftext|>"
             )
-        else:
-            # Fallback: create basic tokenizer
-            logger.warning("No tokenizer.json found, creating basic tokenizer")
-            from transformers import AutoTokenizer
-            # Try to load from the model path with local_files_only
             try:
-                tokenizer = AutoTokenizer.from_pretrained(
-                    model_path,
                     trust_remote_code=True,
-                    local_files_only=True,
-                    cache_dir='/tmp/tokenizer_cache'  # Use temp cache
                 )
-            except Exception as e:
-                logger.error(f"Failed to load tokenizer: {e}")
-                raise e
-        # Set special tokens
-        if not hasattr(tokenizer, 'pad_token') or tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-            tokenizer.pad_token_id = tokenizer.eos_token_id
-        return tokenizer
-    def _create_model_manually(self, config: Qwen2Config, model_path: str) -> Qwen2ForCausalLM:
-        """Create model architecture and load weights manually"""
-        logger.info("Creating Qwen2ForCausalLM with config")
-        model = Qwen2ForCausalLM(config)
-        # Find safetensors files
-        safetensors_files = glob.glob(os.path.join(model_path, "*.safetensors"))
-        logger.info(f"Found {len(safetensors_files)} safetensors files")
-        if not safetensors_files:
-            raise FileNotFoundError("No safetensors files found")
-        # Load weights manually with memory optimization
-        from safetensors.torch import load_file
-        # Convert to half precision before loading weights to save memory
-        model = model.half()
-        logger.info("Converted model to half precision")
-        # Load weights in chunks to avoid memory spikes
-        state_dict = {}
-        total_files = len(safetensors_files)
-        for i, file in enumerate(sorted(safetensors_files)):
-            logger.info(f"Loading weights from file {i+1}/{total_files}: {os.path.basename(file)}")
             try:
-                # Load partial weights
-                partial_state_dict = load_file(file)
-                # Convert to half precision immediately
-                partial_state_dict = {k: v.half() for k, v in partial_state_dict.items()}
-                # Update state dict
-                state_dict.update(partial_state_dict)
-                # Clear partial dict to free memory
-                del partial_state_dict
-                # Force garbage collection
-                import gc
-                gc.collect()
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-                logger.info(f"Loaded file {i+1}/{total_files}, current memory usage: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
-            except Exception as e:
-                logger.error(f"Failed to load file {file}: {e}")
-                raise e
-        logger.info(f"Total state dict keys: {len(state_dict)}")
-        # Load weights into model
-        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
-        if missing_keys:
-            logger.warning(f"Missing keys: {len(missing_keys)} keys missing")
-            logger.warning(f"First few missing: {missing_keys[:5]}")
-        if unexpected_keys:
-            logger.warning(f"Unexpected keys: {len(unexpected_keys)} unexpected keys")
-            logger.warning(f"First few unexpected: {unexpected_keys[:5]}")
-        # Clear state dict to free memory
-        del state_dict
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        # Move to GPU if available
-        if torch.cuda.is_available():
-            model = model.cuda()
-            logger.info(f"Model moved to GPU, final memory usage: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
         model.eval()
         return model
@@ -252,7 +153,7 @@ class EndpointHandler:
             if not inputs:
                 return [{"error": "No input provided", "generated_text": ""}]
-            # Generation parameters
             max_new_tokens = min(parameters.get("max_new_tokens", 512), 1024)
             temperature = max(0.1, min(parameters.get("temperature", 0.7), 2.0))
             top_p = max(0.1, min(parameters.get("top_p", 0.9), 1.0))
@@ -264,40 +165,63 @@ class EndpointHandler:
             else:
                 formatted_input = f"<|im_start|>user\n{inputs}<|im_end|>\n<|im_start|>assistant\n"
-            # Tokenize
-            input_ids = self.tokenizer.encode(
-                formatted_input,
-                return_tensors="pt",
-                truncation=True,
-                max_length=3072
-            )
-            input_ids = input_ids.to(self.model.device)
-            # Generate
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    input_ids,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    do_sample=do_sample,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id,
-                    use_cache=True
-                )
-            # Decode response
-            generated_ids = outputs[0][input_ids.size(1):]
-            response = self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
-            response = response.replace("<|im_end|>", "").strip()
-            return [{
-                "generated_text": response,
-                "generated_tokens": len(generated_ids),
-                "finish_reason": "eos_token" if self.tokenizer.eos_token_id in generated_ids else "length"
-            }]
         except Exception as e:
-            logger.error(f"Generation error: {str(e)}")
-            return [{"error": f"Generation failed: {str(e)}", "generated_text": ""}]

 import json
 import os
 import glob
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import logging
 # Set up logging
 class EndpointHandler:
     def __init__(self, path: str = ""):
         """
+        Initialize handler with robust file discovery
         """
         logger.info(f"Loading model from {path}")
         try:
+            # Log directory contents to understand structure
+            if os.path.exists(path):
+                contents = os.listdir(path)
+                logger.info(f"Repository contents: {contents}")
+                # Look for model files in subdirectories
+                for item in contents:
+                    item_path = os.path.join(path, item)
+                    if os.path.isdir(item_path):
+                        sub_contents = os.listdir(item_path)
+                        logger.info(f"Directory {item}: {sub_contents}")
+            # Try to find the actual model path
+            model_path = self._find_model_path(path)
+            logger.info(f"Using model path: {model_path}")
+            # Load tokenizer - try multiple approaches
+            self.tokenizer = self._load_tokenizer(model_path, path)
             logger.info("Tokenizer loaded successfully")
+            # Load model
+            self.model = self._load_model(model_path, path)
             logger.info("Model loaded successfully")
         except Exception as e:
             logger.error(f"Failed to initialize: {str(e)}")
             raise e
+    def _find_model_path(self, base_path: str) -> str:
+        """Find the actual path containing model files"""
+        # Check if config.json is in base path
+        if os.path.exists(os.path.join(base_path, "config.json")):
+            return base_path
+        # Check models/huggingface subdirectory
+        hf_path = os.path.join(base_path, "models", "huggingface")
+        if os.path.exists(hf_path) and os.path.exists(os.path.join(hf_path, "config.json")):
+            return hf_path
+        # Check for any subdirectory with config.json
+        for root, dirs, files in os.walk(base_path):
+            if "config.json" in files:
+                return root
+        # Fallback to base path
+        return base_path
+    def _load_tokenizer(self, model_path: str, base_path: str):
+        """Load tokenizer with fallback methods"""
+        try:
+            # Try direct loading from model path
+            logger.info(f"Trying to load tokenizer from {model_path}")
+            return AutoTokenizer.from_pretrained(
+                model_path,
+                trust_remote_code=True,
+                local_files_only=True
             )
+        except Exception as e1:
+            logger.warning(f"Failed to load from {model_path}: {e1}")
             try:
+                # Try loading from base path
+                logger.info(f"Trying to load tokenizer from {base_path}")
+                return AutoTokenizer.from_pretrained(
+                    base_path,
                     trust_remote_code=True,
+                    local_files_only=True
                 )
+            except Exception as e2:
+                logger.warning(f"Failed to load from {base_path}: {e2}")
+                try:
+                    # Try loading from Hugging Face Hub as fallback
+                    logger.info("Using fallback tokenizer from Qwen2-7B-Instruct")
+                    tokenizer = AutoTokenizer.from_pretrained(
+                        "Qwen/Qwen2-7B-Instruct",
+                        trust_remote_code=True
+                    )
+                    # Set special tokens
+                    tokenizer.pad_token = tokenizer.eos_token
+                    return tokenizer
+                except Exception as e3:
+                    logger.error(f"All tokenizer loading methods failed: {e3}")
+                    raise e3
+    def _load_model(self, model_path: str, base_path: str):
+        """Load model with fallback methods"""
+        try:
+            # Try direct loading from model path
+            logger.info(f"Trying to load model from {model_path}")
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True,
+                local_files_only=True,
+                low_cpu_mem_usage=True
+            )
+        except Exception as e1:
+            logger.warning(f"Failed to load from {model_path}: {e1}")
             try:
+                # Try loading from base path
+                logger.info(f"Trying to load model from {base_path}")
+                model = AutoModelForCausalLM.from_pretrained(
+                    base_path,
+                    torch_dtype=torch.float16,
+                    device_map="auto",
+                    trust_remote_code=True,
+                    local_files_only=True,
+                    low_cpu_mem_usage=True
+                )
+            except Exception as e2:
+                logger.error(f"Model loading failed from both paths: {e2}")
+                raise e2
         model.eval()
         return model
             if not inputs:
                 return [{"error": "No input provided", "generated_text": ""}]
+            # Generation parameters with safety limits
             max_new_tokens = min(parameters.get("max_new_tokens", 512), 1024)
             temperature = max(0.1, min(parameters.get("temperature", 0.7), 2.0))
             top_p = max(0.1, min(parameters.get("top_p", 0.9), 1.0))
             else:
                 formatted_input = f"<|im_start|>user\n{inputs}<|im_end|>\n<|im_start|>assistant\n"
+            # Tokenize with error handling
+            try:
+                input_ids = self.tokenizer.encode(
+                    formatted_input,
+                    return_tensors="pt",
+                    truncation=True,
+                    max_length=3072
+                )
+            except Exception as e:
+                logger.error(f"Tokenization failed: {e}")
+                return [{"error": f"Tokenization failed: {str(e)}", "generated_text": ""}]
+            if input_ids.size(1) == 0:
+                return [{"error": "Empty input after tokenization", "generated_text": ""}]
+            # Move to model device
+            input_ids = input_ids.to(next(self.model.parameters()).device)
+            # Generate with error handling
+            try:
+                with torch.no_grad():
+                    outputs = self.model.generate(
+                        input_ids,
+                        max_new_tokens=max_new_tokens,
+                        temperature=temperature,
+                        top_p=top_p,
+                        do_sample=do_sample,
+                        pad_token_id=self.tokenizer.pad_token_id,
+                        eos_token_id=self.tokenizer.eos_token_id,
+                        use_cache=True,
+                        num_return_sequences=1
+                    )
+            except Exception as e:
+                logger.error(f"Generation failed: {e}")
+                return [{"error": f"Generation failed: {str(e)}", "generated_text": ""}]
+            # Decode response
+            try:
+                generated_ids = outputs[0][input_ids.size(1):]
+                response = self.tokenizer.decode(
+                    generated_ids,
+                    skip_special_tokens=True
+                ).strip()
+                # Clean up response
+                response = response.replace("<|im_end|>", "").strip()
+                return [{
+                    "generated_text": response,
+                    "generated_tokens": len(generated_ids),
+                    "finish_reason": "eos_token" if self.tokenizer.eos_token_id in generated_ids else "length"
+                }]
+            except Exception as e:
+                logger.error(f"Decoding failed: {e}")
+                return [{"error": f"Decoding failed: {str(e)}", "generated_text": ""}]
         except Exception as e:
+            logger.error(f"Inference error: {str(e)}")
+            return [{"error": f"Inference failed: {str(e)}", "generated_text": ""}]