Update handler.py

Browse files

Files changed (1) hide show

handler.py +44 -7

handler.py CHANGED Viewed

@@ -174,14 +174,45 @@ class EndpointHandler:
         if not safetensors_files:
             raise FileNotFoundError("No safetensors files found")
-        # Load weights manually
         from safetensors.torch import load_file
         state_dict = {}
-        for file in sorted(safetensors_files):
-            logger.info(f"Loading weights from: {file}")
-            partial_state_dict = load_file(file)
-            state_dict.update(partial_state_dict)
         logger.info(f"Total state dict keys: {len(state_dict)}")
@@ -196,10 +227,16 @@ class EndpointHandler:
             logger.warning(f"Unexpected keys: {len(unexpected_keys)} unexpected keys")
             logger.warning(f"First few unexpected: {unexpected_keys[:5]}")
-        # Convert to half precision and move to GPU
-        model = model.half()
         if torch.cuda.is_available():
             model = model.cuda()
         model.eval()
         return model

         if not safetensors_files:
             raise FileNotFoundError("No safetensors files found")
+        # Load weights manually with memory optimization
         from safetensors.torch import load_file
+        # Convert to half precision before loading weights to save memory
+        model = model.half()
+        logger.info("Converted model to half precision")
+        # Load weights in chunks to avoid memory spikes
         state_dict = {}
+        total_files = len(safetensors_files)
+        for i, file in enumerate(sorted(safetensors_files)):
+            logger.info(f"Loading weights from file {i+1}/{total_files}: {os.path.basename(file)}")
+            try:
+                # Load partial weights
+                partial_state_dict = load_file(file)
+                # Convert to half precision immediately
+                partial_state_dict = {k: v.half() for k, v in partial_state_dict.items()}
+                # Update state dict
+                state_dict.update(partial_state_dict)
+                # Clear partial dict to free memory
+                del partial_state_dict
+                # Force garbage collection
+                import gc
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                logger.info(f"Loaded file {i+1}/{total_files}, current memory usage: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
+            except Exception as e:
+                logger.error(f"Failed to load file {file}: {e}")
+                raise e
         logger.info(f"Total state dict keys: {len(state_dict)}")
             logger.warning(f"Unexpected keys: {len(unexpected_keys)} unexpected keys")
             logger.warning(f"First few unexpected: {unexpected_keys[:5]}")
+        # Clear state dict to free memory
+        del state_dict
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        # Move to GPU if available
         if torch.cuda.is_available():
             model = model.cuda()
+            logger.info(f"Model moved to GPU, final memory usage: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
         model.eval()
         return model