Baby's first adventure with a diffusion language model. I had to quantize this so it would fit on a 3080 Ti - all I've got! I used Modal to do the quantization. If you want to replicate what I did, try this:
First, install Modal and log into the CLI:

```bash
uv add modal
uv run modal login
```
Then, add an environment and a volume to the project (the script below only needs the volume):

```bash
uv run modal volume create quantized-model-output
```
Then, run the quantization script:

```bash
uv run modal run scripts/quantize_llada.py
```
```python
# scripts/quantize_llada.py
import modal

image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04", add_python="3.13")
    .apt_install("git", "curl")
    .pip_install(
        "torch>=2.7.0",
        "torchvision",
        "torchaudio",
        index_url="https://download.pytorch.org/whl/cu128",
    )
    .pip_install(
        "numpy",
        "accelerate",
        "optimum",
        "loguru",
        "transformers",
    )
    .pip_install("triton")
    .pip_install("gptqmodel")
)

# These need to be created before running the app:
# uv run modal volume create quantized-model-output
output_volume = modal.Volume.from_name("quantized-model-output")
volume_config = {"/quantized-model-output": output_volume}

app = modal.App("quantize-llada", image=image, volumes=volume_config)

TRAIN_GPU_COUNT = 1
TRAIN_GPU = f"B200:{TRAIN_GPU_COUNT}"
TRAIN_CPU_COUNT = 4
MINUTES = 40


@app.function(gpu=TRAIN_GPU, cpu=TRAIN_CPU_COUNT, timeout=MINUTES * 60)
def quantize_model() -> None:
    import types

    import torch
    from loguru import logger
    from optimum.gptq import GPTQQuantizer
    from transformers import AutoModel, AutoTokenizer

    output_volume.reload()

    # Check if CUDA is available, show device count
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(f"Device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"PyTorch version: {torch.__version__}")

    # Check if GPU is available
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

    # We need to do some shenanigans.
    # First, load the model on the CPU so we can patch the forward pass.
    logger.info("Loading model on CPU...")
    model = AutoModel.from_pretrained(
        "GSAI-ML/LLaDA-8B-Instruct",
        trust_remote_code=True,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    logger.info("Patching model forward pass...")

    def patched_forward(self, x, *args, attention_bias=None, layer_past=None, use_cache=False, **kwargs):
        """
        Patched forward that handles both positional and keyword arguments for attention_bias
        """
        # If attention_bias was passed as a positional argument, use it
        if len(args) > 0 and attention_bias is None:
            attention_bias = args[0]
            args = args[1:]
        if len(args) > 0 and layer_past is None:
            layer_past = args[0]
            args = args[1:]
        if len(args) > 0:
            use_cache = args[0]
        # Call the original forward with cleaned arguments
        return self._original_forward(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)

    # Apply patch to all LLaDALlamaBlock instances
    for name, module in model.named_modules():
        if module.__class__.__name__ == "LLaDALlamaBlock":
            # Store original forward
            module._original_forward = module.forward
            # Replace with patched version
            module.forward = types.MethodType(patched_forward, module)
            logger.info(f"Patched {name}")

    # Move model to GPU
    logger.info("Moving model to GPU...")
    model = model.to("cuda")

    logger.info("Setting up GPTQ quantizer...")
    quantizer = GPTQQuantizer(
        bits=4,
        group_size=128,
        desc_act=False,
        sym=True,
        true_sequential=True,
        dataset="c4",
        tokenizer=tokenizer,
        block_name_to_quantize="model.transformer.blocks",
    )

    logger.info("Quantizing model...")
    quantizer.quantize_model(model, tokenizer)

    logger.info("Quantization done, saving model...")
    output_path = "/quantized-model-output/llada-8b-instruct-4bit-gptq"
    logger.info(f"Saving model to {output_path}")
    quantizer.save(model, output_path)
    tokenizer.save_pretrained(output_path)
    logger.success("Quantization done, model saved!")


@app.local_entrypoint()
def main():
    quantize_model.remote()
```
I think I also had to manually change something in the saved config.json -- the key needed to be `"block_name_to_quantize": "model.transformer.blocks"` instead of whatever the quantizer wrote out (maybe `model_block_name_to_quantize` or something).
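For reference, a minimal sketch of that config.json fix, assuming the key lives under the top-level `quantization_config` entry that optimum writes (double-check your file's actual layout; the path below is just a placeholder):

```python
import json
from pathlib import Path

# Placeholder path: wherever your copy of the quantized checkpoint lives.
config_path = Path("llada-8b-instruct-4bit-gptq/config.json")
config = json.loads(config_path.read_text())

# Assumption: the GPTQ settings sit under "quantization_config"; if they live
# somewhere else in your config.json, apply the same fix there instead.
quant_cfg = config.get("quantization_config", config)
quant_cfg.pop("model_block_name_to_quantize", None)  # drop the misnamed key if present
quant_cfg["block_name_to_quantize"] = "model.transformer.blocks"

config_path.write_text(json.dumps(config, indent=2) + "\n")
```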
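The finished checkpoint lands on the `quantized-model-output` volume, and `modal volume get` should be able to pull it down locally. Loading it for inference isn't covered above, but as a rough, untested sketch of a starting point (assuming optimum and gptqmodel are installed locally, and using a placeholder local path):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Placeholder local path: wherever you copied the quantized checkpoint to.
path = "llada-8b-instruct-4bit-gptq"

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,   # LLaDA ships custom modeling code
    device_map="cuda",
    torch_dtype=torch.float16,
)
model.eval()
```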