Update handler.py
handler.py CHANGED: +29 -20
@@ -121,10 +121,18 @@ class EndpointHandler:
         return tokenizer
 
     def _load_model_manual(self, model_path: str):
-        """Load model completely manually"""
+        """Load model completely manually with memory optimization"""
 
         logger.info("Loading model manually...")
 
+        # Check GPU availability and memory
+        if torch.cuda.is_available():
+            logger.info(f"CUDA available: {torch.cuda.get_device_name()}")
+            logger.info(f"GPU memory before loading: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
+            logger.info(f"GPU memory total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f}GB")
+        else:
+            logger.warning("CUDA not available, using CPU")
+
         # Load config manually
         config_path = os.path.join(model_path, "config.json")
         with open(config_path, 'r') as f:
@@ -145,8 +153,12 @@ class EndpointHandler:
 
         # Create model
         model = Qwen2ForCausalLM(config)
+        logger.info("Model architecture created")
 
-        # Load weights manually from safetensors
+        if torch.cuda.is_available():
+            logger.info(f"GPU memory after model creation: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
+
+        # Load weights manually from safetensors with memory optimization
         import glob
         safetensors_files = glob.glob(os.path.join(model_path, "*.safetensors"))
         logger.info(f"Found {len(safetensors_files)} safetensors files")
@@ -154,15 +166,20 @@ class EndpointHandler:
         if safetensors_files:
             from safetensors.torch import load_file
 
-            # Load weights in
-            state_dict = {}
+            # Load weights directly into model without accumulating in state_dict
             for i, file in enumerate(sorted(safetensors_files)):
                 logger.info(f"Loading weights from file {i+1}/{len(safetensors_files)}: {os.path.basename(file)}")
 
+                # Load partial weights
                 partial_state_dict = load_file(file)
-                state_dict.update(partial_state_dict)
 
-                # Clear partial dict to free memory
+                if torch.cuda.is_available():
+                    logger.info(f"GPU memory after loading file {i+1}: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
+
+                # Load this partial state dict directly into the model
+                missing_keys, unexpected_keys = model.load_state_dict(partial_state_dict, strict=False)
+
+                # Clear partial dict immediately to free memory
                 del partial_state_dict
 
             # Force garbage collection
@@ -171,27 +188,19 @@ class EndpointHandler:
 
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
-
-            # Load weights into model
-            missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
-
-            if missing_keys:
-                logger.warning(f"Missing keys: {len(missing_keys)} keys missing")
-            if unexpected_keys:
-                logger.warning(f"Unexpected keys: {len(unexpected_keys)} unexpected keys")
-
-            # Clear state dict to free memory
-            del state_dict
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
+                    logger.info(f"GPU memory after cleanup: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
 
         # Convert to half precision and move to GPU
+        logger.info("Converting model to half precision...")
         model = model.half()
+
         if torch.cuda.is_available():
+            logger.info(f"GPU memory after half precision: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
             model = model.cuda()
+            logger.info(f"GPU memory after moving to GPU: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
 
         model.eval()
+        logger.info("Model loaded successfully and set to eval mode")
         return model
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
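A note on the pattern above: the removed code accumulated every shard into a single `state_dict` before one `load_state_dict` call, which briefly holds two full copies of the weights in host memory (the accumulated dict plus the model). The new code applies each shard as it is read, so peak usage is roughly the model plus one shard. Below is a minimal, self-contained sketch of that pattern, assuming a local directory of sharded `*.safetensors` files and the same Qwen2 architecture as the handler; `load_sharded_checkpoint` is an illustrative name, and `AutoConfig` stands in for the handler's manual config.json parsing.

```python
# Illustrative sketch only -- not part of the commit. It condenses the
# shard-by-shard loading pattern from _load_model_manual above.
import gc
import glob
import os

import torch
from safetensors.torch import load_file
from transformers import AutoConfig, Qwen2ForCausalLM


def load_sharded_checkpoint(model_path: str):  # hypothetical helper name
    # Build the architecture from config.json (weights start random-initialized)
    config = AutoConfig.from_pretrained(model_path)
    model = Qwen2ForCausalLM(config)

    for shard in sorted(glob.glob(os.path.join(model_path, "*.safetensors"))):
        shard_state = load_file(shard)  # tensors from this shard only
        # strict=False because each shard holds only a subset of the keys;
        # at most one shard is resident in memory at a time.
        model.load_state_dict(shard_state, strict=False)
        del shard_state
        gc.collect()

    model = model.half()
    if torch.cuda.is_available():
        model = model.cuda()
    return model.eval()
```

The `strict=False` flag is what makes per-shard loading possible: each call sees only that shard's keys, so every other parameter is reported as "missing" by design. The trade-off, visible in the removed lines, is that the aggregate missing/unexpected-key check after a single full `load_state_dict` goes away; a shard-wise loader that still wants that safety net has to collect and reconcile the per-call results itself.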