Update handler.py
Browse files
- handler.py +2 -0
handler.py
CHANGED
@@ -125,6 +125,7 @@ class EndpointHandler:
|
|
125 |
trust_remote_code=True,
|
126 |
device_map=device_map,
|
127 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
|
|
128 |
low_cpu_mem_usage=True,
|
129 |
offload_folder=offload_folder if device_map == "auto" else None,
|
130 |
max_memory=max_memory,
|
@@ -144,6 +145,7 @@ class EndpointHandler:
|
|
144 |
model_path,
|
145 |
trust_remote_code=True,
|
146 |
torch_dtype=torch.float32,
|
|
|
147 |
low_cpu_mem_usage=True,
|
148 |
)
|
149 |
logger.info("Successfully loaded with FP32 on CPU")
|
|
|
125 |
trust_remote_code=True,
|
126 |
device_map=device_map,
|
127 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
128 |
+
quantization_config=None, # Disable model's built-in quantization
|
129 |
low_cpu_mem_usage=True,
|
130 |
offload_folder=offload_folder if device_map == "auto" else None,
|
131 |
max_memory=max_memory,
|
|
|
145 |
model_path,
|
146 |
trust_remote_code=True,
|
147 |
torch_dtype=torch.float32,
|
148 |
+
quantization_config=None, # Disable model's built-in quantization
|
149 |
low_cpu_mem_usage=True,
|
150 |
)
|
151 |
logger.info("Successfully loaded with FP32 on CPU")
|