Update README.md

README.md
```diff
@@ -108,6 +108,7 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableMistral3ForConditionalGeneration
+from datasets import load_dataset, interleave_datasets
 from PIL import Image
 import io
 
```
```diff
@@ -116,7 +117,7 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 model_name = model_stub.split("/")[-1]
 
 num_text_samples = 1024
-
+num_vision_samples = 1024
 max_seq_len = 8192
 
 processor = AutoProcessor.from_pretrained(model_stub)
```
```diff
@@ -134,13 +135,14 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
             example["messages"],
             add_generation_prompt=False,
         ),
-        "images"
+        "images": None,
     }
     tokenized_input = processor(**input, max_length=max_seq_len, truncation=True)
     tokenized_input["pixel_values"] = tokenized_input.get("pixel_values", None)
     tokenized_input["image_sizes"] = tokenized_input.get("image_sizes", None)
+    return tokenized_input
 
-dst = load_dataset("neuralmagic/calibration", name="LLM", split="train").select(num_text_samples)
+dst = load_dataset("neuralmagic/calibration", name="LLM", split="train").select(range(num_text_samples))
 dst = dst.map(preprocess_text, remove_columns=dst.column_names)
 
 # Text + vision data subset
```
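Pieced together from the context lines above, the text-only preprocessing helper presumably reads as follows after this change. The `def` line and the opening of the `input` dict are not part of the diff, so treat this as a sketch rather than the exact README content:

```python
# Sketch (assumption): the text-only calibration preprocessor after this change.
def preprocess_text(example):
    input = {
        "text": processor.apply_chat_template(
            example["messages"],
            add_generation_prompt=False,
        ),
        "images": None,
    }
    tokenized_input = processor(**input, max_length=max_seq_len, truncation=True)
    # Text-only samples carry no image features; keep the keys present but None.
    tokenized_input["pixel_values"] = tokenized_input.get("pixel_values", None)
    tokenized_input["image_sizes"] = tokenized_input.get("image_sizes", None)
    return tokenized_input
```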
```diff
@@ -149,11 +151,11 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
     image = None
     for message in example["messages"]:
         message_content = []
-        for content in message["content"]
+        for content in message["content"]:
             if content["type"] == "text":
-                message_content
+                message_content.append({"type": "text", "text": content["text"]})
             else:
-                message_content
+                message_content.append({"type": "image"})
                 image = Image.open(io.BytesIO(content["image"]))
 
         messages.append(
```
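For orientation, the loop above walks calibration messages whose content entries carry either plain text or raw image bytes. A hypothetical example of the structure it expects, with field names inferred from the code rather than taken from the dataset itself:

```python
# Hypothetical calibration example (structure inferred from the loop above):
# each content entry is either text or an image whose raw bytes are opened with PIL.
example = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": b"<raw image bytes>"},
                {"type": "text", "text": "Describe this image."},
            ],
        },
    ],
}
```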
```diff
@@ -168,13 +170,14 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
             messages,
             add_generation_prompt=False,
         ),
-        "images"
+        "images": image,
     }
     tokenized_input = processor(**input, max_length=max_seq_len, truncation=True)
     tokenized_input["pixel_values"] = tokenized_input.get("pixel_values", None)
     tokenized_input["image_sizes"] = tokenized_input.get("image_sizes", None)
+    return tokenized_input
 
-dsv = load_dataset("neuralmagic/calibration", name="
+dsv = load_dataset("neuralmagic/calibration", name="VLM", split="train").select(range(num_vision_samples))
 dsv = dsv.map(preprocess_vision, remove_columns=dsv.column_names)
 
 # Interleave subsets
```
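The hunk ends at the `# Interleave subsets` comment; given the `interleave_datasets` import added in the first hunk, the two subsets are presumably combined along these lines (a sketch, not part of the diff):

```python
# Sketch: mix the text-only and vision calibration subsets into one dataset,
# reusing dst and dsv from the preprocessing steps above.
from datasets import interleave_datasets

ds = interleave_datasets([dst, dsv])
```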
```diff
@@ -182,9 +185,9 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 
 # Configure the quantization algorithm and scheme
 recipe = GPTQModifier(
-    ignore=["language_model.lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"]
-    sequential_targets=["MistralDecoderLayer"]
-    dampening_frac=0.01
+    ignore=["language_model.lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+    sequential_targets=["MistralDecoderLayer"],
+    dampening_frac=0.01,
     targets="Linear",
     scheme="W4A16",
 )
```
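The next hunk passes a `data_collator` that is defined elsewhere in the README and not shown in this diff. For single-sample multimodal calibration of this kind, such a collator typically just tensorizes one example at a time; a rough sketch, with the details assumed rather than taken from the README:

```python
import torch

# Rough sketch of a single-sample collator: convert one tokenized example to tensors,
# passing pixel_values/image_sizes through as None for text-only samples.
def data_collator(batch):
    assert len(batch) == 1
    item = batch[0]
    return {
        "input_ids": torch.LongTensor(item["input_ids"]),
        "attention_mask": torch.tensor(item["attention_mask"]),
        "pixel_values": torch.tensor(item["pixel_values"]) if item.get("pixel_values") is not None else None,
        "image_sizes": torch.tensor(item["image_sizes"]) if item.get("image_sizes") is not None else None,
    }
```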
````diff
@@ -213,12 +216,13 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
     recipe=recipe,
     max_seq_length=max_seq_len,
     data_collator=data_collator,
+    num_samples=num_text_samples + num_vision_samples,
 )
 
 # Save to disk in compressed-tensors format
-save_path = model_name + "-quantized.w4a16
+save_path = model_name + "-quantized.w4a16"
 model.save_pretrained(save_path)
-
+processor.save_pretrained(save_path)
 print(f"Model and tokenizer saved to: {save_path}")
 ```
 </details>
````
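The hunk headers place this example in the README section on serving with vLLM. Once saved, the compressed-tensors checkpoint should load in vLLM like any other local model; a minimal sketch, assuming vLLM is installed and `save_path` is the directory written above:

```python
from vllm import LLM, SamplingParams

# Load the locally saved W4A16 checkpoint (directory written by model.save_pretrained above).
llm = LLM(model=save_path)
outputs = llm.generate(
    ["Give a one-sentence summary of weight-only quantization."],
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```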