---
license: mit
base_model:
- GSAI-ML/LLaDA-8B-Instruct
pipeline_tag: text-generation
---
Baby's first adventure with a diffusion language model. Had to quantize LLaDA-8B-Instruct down to 4-bit GPTQ so it would fit on a 3080 Ti - all I've got!

Used Modal to do the quantization. If you want to replicate what I did, try this:
First, install Modal and log in to the CLI:

```
uv add modal
uv run modal login
```

Then, add an environment and a volume to the project:

```
uv run modal volume create quantized-model-output
```

Then, run the quantization script:

```
uv run modal run scripts/quantize_llada.py
```

Here's the full script:
```
# scripts/quantize_llada.py
import modal

image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04", add_python="3.13")
    .apt_install("git", "curl")
    .pip_install(
        "torch>=2.7.0",
        "torchvision",
        "torchaudio",
        index_url="https://download.pytorch.org/whl/cu128",
    )
    .pip_install(
        "numpy",
        "accelerate",
        "optimum",
        "loguru",
        "transformers",
    )
    .pip_install("triton")
    .pip_install("gptqmodel")
)

# This needs to be created before running the app:
#   uv run modal volume create quantized-model-output
output_volume = modal.Volume.from_name("quantized-model-output")
volume_config = {"/quantized-model-output": output_volume}

app = modal.App("quantize-llada", image=image, volumes=volume_config)

TRAIN_GPU_COUNT = 1
TRAIN_GPU = f"B200:{TRAIN_GPU_COUNT}"
TRAIN_CPU_COUNT = 4
MINUTES = 40


@app.function(gpu=TRAIN_GPU, cpu=TRAIN_CPU_COUNT, timeout=MINUTES * 60)
def quantize_model() -> None:
    import types

    import torch
    from loguru import logger
    from optimum.gptq import GPTQQuantizer
    from transformers import AutoModel, AutoTokenizer

    output_volume.reload()

    # Check if CUDA is available, show device count
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(f"Device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"PyTorch version: {torch.__version__}")

    # Check if GPU is available
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

    # We need to do some shenanigans.
    # First, load the model on the CPU so we can patch the forward pass.
    logger.info("Loading model on CPU...")
    model = AutoModel.from_pretrained(
        "GSAI-ML/LLaDA-8B-Instruct",
        trust_remote_code=True,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    logger.info("Patching model forward pass...")

    def patched_forward(self, x, *args, attention_bias=None, layer_past=None, use_cache=False, **kwargs):
        """
        Patched forward that handles both positional and keyword arguments for attention_bias.
        """
        # If attention_bias was passed as a positional argument, use it
        if len(args) > 0 and attention_bias is None:
            attention_bias = args[0]
            args = args[1:]
        if len(args) > 0 and layer_past is None:
            layer_past = args[0]
            args = args[1:]
        if len(args) > 0:
            use_cache = args[0]
        # Call the original forward with cleaned arguments
        return self._original_forward(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)

    # Apply patch to all LLaDALlamaBlock instances
    for name, module in model.named_modules():
        if module.__class__.__name__ == "LLaDALlamaBlock":
            # Store original forward
            module._original_forward = module.forward
            # Replace with patched version
            module.forward = types.MethodType(patched_forward, module)
            logger.info(f"Patched {name}")

    # Move model to GPU
    logger.info("Moving model to GPU...")
    model = model.to("cuda")

    logger.info("Setting up GPTQ quantizer...")
    quantizer = GPTQQuantizer(
        bits=4,
        group_size=128,
        desc_act=False,
        sym=True,
        true_sequential=True,
        dataset="c4",
        tokenizer=tokenizer,
        block_name_to_quantize="model.transformer.blocks",
    )

    logger.info("Quantizing model...")
    quantizer.quantize_model(model, tokenizer)

    logger.info("Quantization done, saving model...")
    output_path = "/quantized-model-output/llada-8b-instruct-4bit-gptq"
    logger.info(f"Saving model to {output_path}")
    quantizer.save(model, output_path)
    tokenizer.save_pretrained(output_path)
    logger.success("Quantization done, model saved!")


@app.local_entrypoint()
def main():
    quantize_model.remote()
```
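Once it finishes, the quantized checkpoint lives on the `quantized-model-output` volume under `llada-8b-instruct-4bit-gptq`; you should be able to pull it down locally with the Modal CLI's `volume get` command, something like `uv run modal volume get quantized-model-output llada-8b-instruct-4bit-gptq` (check `uv run modal volume get --help` for the exact arguments).
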
I think I also had to manually edit the saved config.json: the key in the quantization config needed to be `"block_name_to_quantize": "model.transformer.blocks"` instead of whatever got written out, maybe `"model_block_name_to_quantize"` or something like that.
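
If you hit the same thing, a small one-off fix along these lines should work (untested sketch; the local checkpoint path and the exact key layout are assumptions):

```
import json
from pathlib import Path

# Untested sketch: point this at your local copy of the quantized checkpoint.
config_path = Path("llada-8b-instruct-4bit-gptq/config.json")
config = json.loads(config_path.read_text())

# The GPTQ settings usually live under "quantization_config"; fall back to the
# top level if that's where they ended up in your copy.
quant_cfg = config.get("quantization_config", config)
quant_cfg.pop("model_block_name_to_quantize", None)  # drop the oddly named key, if present
quant_cfg["block_name_to_quantize"] = "model.transformer.blocks"

config_path.write_text(json.dumps(config, indent=2))
print("Updated", config_path)
```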