Baby's first adventure with a diffusion language model. Had to quantize this one so it would fit on a 3080 Ti - all I've got!

Used Modal to do it. If you want to replicate what I did, try this:

First, install Modal and log in through the CLI:

uv add modal
uv run modal login

Then, create a volume in your Modal environment to hold the quantized output:

uv run modal volume create quantized-model-output

Then, run the quantization script:

uv run modal run scripts/quantize_llada.py

# scripts/quantize_llada.py

import modal

image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04", add_python="3.13")
    .apt_install("git", "curl")
    .pip_install(
        "torch>=2.7.0",
        "torchvision",
        "torchaudio",
        index_url="https://download.pytorch.org/whl/cu128",
    )
    .pip_install(
        "numpy",
        "accelerate",
        "optimum",
        "loguru",
        "transformers",
    )
    .pip_install("triton")
    .pip_install("gptqmodel")
)

# These need to be created before running the app:
# uv run modal volume create quantized-model-output
output_volume = modal.Volume.from_name("quantized-model-output")
volume_config = {"/quantized-model-output": output_volume}

app = modal.App("quantize-llada", image=image, volumes=volume_config)

TRAIN_GPU_COUNT = 1
TRAIN_GPU = f"B200:{TRAIN_GPU_COUNT}"
TRAIN_CPU_COUNT = 4
TIMEOUT_MINUTES = 40


@app.function(gpu=TRAIN_GPU, cpu=TRAIN_CPU_COUNT, timeout=TIMEOUT_MINUTES * 60)
def quantize_model() -> None:
    import types

    import torch
    from loguru import logger
    from optimum.gptq import GPTQQuantizer
    from transformers import AutoModel, AutoTokenizer

    output_volume.reload()

    # Check if CUDA is available, show device count
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(f"Device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"PyTorch version: {torch.__version__}")

    # Check if GPU is available
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

    # We need to do some shenanigans.
    # First, load the model on the CPU so we can patch the forward pass.
    logger.info("Loading model on CPU...")
    model = AutoModel.from_pretrained(
        "GSAI-ML/LLaDA-8B-Instruct",
        trust_remote_code=True,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    logger.info("Patching model forward pass...")

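    # Presumably the GPTQ calibration pass replays the captured block inputs as
    # positional arguments, while LLaDA's custom LLaDALlamaBlock.forward expects
    # keywords. The wrapper below accepts either form and forwards everything to
    # the original forward as keywords.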
    def patched_forward(self, x, *args, attention_bias=None, layer_past=None, use_cache=False, **kwargs):
        """
        Patched forward that handles both positional and keyword arguments for attention_bias
        """
        # If attention_bias was passed as positional argument, use it
        if len(args) > 0 and attention_bias is None:
            attention_bias = args[0]
            args = args[1:]
        if len(args) > 0 and layer_past is None:
            layer_past = args[0]
            args = args[1:]
        if len(args) > 0:
            use_cache = args[0]

        # Call the original forward with cleaned arguments
        return self._original_forward(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)

    # Apply patch to all LLaDALlamaBlock instances
    for name, module in model.named_modules():
        if module.__class__.__name__ == "LLaDALlamaBlock":
            # Store original forward
            module._original_forward = module.forward
            # Replace with patched version
            module.forward = types.MethodType(patched_forward, module)
            logger.info(f"Patched {name}")

    # Move model to GPU
    logger.info("Moving model to GPU...")
    model = model.to("cuda")

    logger.info("Setting up GPTQ quantizer...")
    quantizer = GPTQQuantizer(
        bits=4,
        group_size=128,
        desc_act=False,
        sym=True,
        true_sequential=True,
        dataset="c4",
        tokenizer=tokenizer,
        block_name_to_quantize="model.transformer.blocks",
    )

    logger.info("Quantizing model...")
    quantizer.quantize_model(model, tokenizer)

    logger.info("Quantization done, saving model...")
    output_path = "/quantized-model-output/llada-8b-instruct-4bit-gptq"
    logger.info(f"Saving model to {output_path}")
    quantizer.save(model, output_path)
    tokenizer.save_pretrained(output_path)

    logger.success("Quantization done, model saved!")


@app.local_entrypoint()
def main():
    quantize_model.remote()

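Once the run finishes, the weights sit in the quantized-model-output volume. Something like this should pull them down to your machine (the remote path comes from the output path in the script, so adjust if you changed it):

uv run modal volume get quantized-model-output llada-8b-instruct-4bit-gptq ./llada-8b-instruct-4bit-gptq
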
I think I also had to manually change something in the saved config.json: the key needed to be "block_name_to_quantize": "model.transformer.blocks" instead of whatever optimum wrote out - maybe it was "model_block_name_to_quantize" or something like that.
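
If you hit the same thing, a few lines of Python against the saved output will patch it up. This is only a sketch - the nesting under "quantization_config" and the exact stray key are assumptions, so check what optimum actually wrote first:

import json

path = "llada-8b-instruct-4bit-gptq/config.json"  # local copy of the saved config
with open(path) as f:
    config = json.load(f)

# Hypothetical fix: make sure the loader sees the key it expects, whatever the
# quantizer originally wrote.
qc = config.get("quantization_config", {})
qc["block_name_to_quantize"] = "model.transformer.blocks"
config["quantization_config"] = qc

with open(path, "w") as f:
    json.dump(config, f, indent=2)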

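For what it's worth, loading the result locally looks roughly like this - a minimal sketch, assuming you have optimum and gptqmodel (or auto-gptq) installed alongside transformers, and that the weights are in the directory you pulled from the volume:

import torch
from transformers import AutoModel, AutoTokenizer

path = "llada-8b-instruct-4bit-gptq"  # local copy of the quantized model

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    device_map="cuda",          # 4-bit weights fit comfortably in the 3080 Ti's 12 GB
    torch_dtype=torch.float16,
)
model.eval()

Generation itself goes through LLaDA's diffusion sampling loop rather than a standard generate() call, so see the upstream LLaDA repo for that part.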