Update handler.py
Browse files- handler.py +2 -0
handler.py
CHANGED
|
@@ -125,6 +125,7 @@ class EndpointHandler:
|
|
| 125 |
trust_remote_code=True,
|
| 126 |
device_map=device_map,
|
| 127 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
|
|
|
| 128 |
low_cpu_mem_usage=True,
|
| 129 |
offload_folder=offload_folder if device_map == "auto" else None,
|
| 130 |
max_memory=max_memory,
|
|
@@ -144,6 +145,7 @@ class EndpointHandler:
|
|
| 144 |
model_path,
|
| 145 |
trust_remote_code=True,
|
| 146 |
torch_dtype=torch.float32,
|
|
|
|
| 147 |
low_cpu_mem_usage=True,
|
| 148 |
)
|
| 149 |
logger.info("Successfully loaded with FP32 on CPU")
|
|
|
|
| 125 |
trust_remote_code=True,
|
| 126 |
device_map=device_map,
|
| 127 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
| 128 |
+
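quantization_config=None,  # Intended to disable the model's built-in quantization. NOTE(review): None is the default and may not override a quantization_config stored in the checkpoint's config.json - confirm.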
|
| 129 |
low_cpu_mem_usage=True,
|
| 130 |
offload_folder=offload_folder if device_map == "auto" else None,
|
| 131 |
max_memory=max_memory,
|
|
|
|
| 145 |
model_path,
|
| 146 |
trust_remote_code=True,
|
| 147 |
torch_dtype=torch.float32,
|
| 148 |
+
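quantization_config=None,  # Intended to disable the model's built-in quantization on the CPU/FP32 path. NOTE(review): None is the default and may not override a quantization_config stored in the checkpoint's config.json - confirm.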
|
| 149 |
low_cpu_mem_usage=True,
|
| 150 |
)
|
| 151 |
logger.info("Successfully loaded with FP32 on CPU")
|