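Update handler.py: load the model with 8-bit quantization (load_in_8bit=True) instead of torch.float16 to fit memory limits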

Files changed (1)

handler.py CHANGED Viewed

@@ -108,14 +108,14 @@ class EndpointHandler:
                     raise e3
     def _load_model(self, model_path: str, base_path: str):
-        """Load model with fallback methods"""
         try:
-            # Try direct loading from model path
-            logger.info(f"Trying to load model from {model_path}")
             model = AutoModelForCausalLM.from_pretrained(
                 model_path,
-                torch_dtype=torch.float16,
                 device_map="auto",
                 trust_remote_code=True,
                 local_files_only=True,
@@ -125,11 +125,11 @@ class EndpointHandler:
             logger.warning(f"Failed to load from {model_path}: {e1}")
             try:
-                # Try loading from base path
-                logger.info(f"Trying to load model from {base_path}")
                 model = AutoModelForCausalLM.from_pretrained(
                     base_path,
-                    torch_dtype=torch.float16,
                     device_map="auto",
                     trust_remote_code=True,
                     local_files_only=True,

                     raise e3
     def _load_model(self, model_path: str, base_path: str):
+        """Load model with 8-bit quantization to fit memory limits"""
         try:
+            # Try direct loading from model path with 8-bit quantization
+            logger.info(f"Trying to load model from {model_path} with 8-bit quantization")
             model = AutoModelForCausalLM.from_pretrained(
                 model_path,
+                load_in_8bit=True,  # Use 8-bit quantization
                 device_map="auto",
                 trust_remote_code=True,
                 local_files_only=True,
             logger.warning(f"Failed to load from {model_path}: {e1}")
             try:
+                # Try loading from base path with 8-bit quantization
+                logger.info(f"Trying to load model from {base_path} with 8-bit quantization")
                 model = AutoModelForCausalLM.from_pretrained(
                     base_path,
+                    load_in_8bit=True,  # Use 8-bit quantization
                     device_map="auto",
                     trust_remote_code=True,
                     local_files_only=True,