---
license: mit
base_model:
- GSAI-ML/LLaDA-8B-Instruct
pipeline_tag: text-generation
---

Baby's first adventure with a diffusion language model. I had to quantize it so it would fit on a 3080 Ti - all I've got!

I used Modal to do the quantization. If you want to replicate what I did, try this:

First, install modal and log into the CLI:

```
uv add modal
uv run modal login
```

Then, create a volume to hold the quantized output (I also set up a Modal environment, but only the volume is referenced by the script):

```
uv run modal volume create quantized-model-output
```


Then, run the quantization script:

```
uv run modal run scripts/quantize_llada.py
```

```
# scripts/quantize_llada.py

import modal

image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04", add_python="3.13")
    .apt_install("git", "curl")
    .pip_install(
        "torch>=2.7.0",
        "torchvision",
        "torchaudio",
        index_url="https://download.pytorch.org/whl/cu128",
    )
    .pip_install(
        "numpy",
        "accelerate",
        "optimum",
        "loguru",
        "transformers",
    )
    .pip_install("triton")
    .pip_install("gptqmodel")
)

# These need to be created before running the app:
# uv run modal volume create quantized-model-output
output_volume = modal.Volume.from_name("quantized-model-output")
volume_config = {"/quantized-model-output": output_volume}

app = modal.App("quantize-llada", image=image, volumes=volume_config)

TRAIN_GPU_COUNT = 1
TRAIN_GPU = f"B200:{TRAIN_GPU_COUNT}"
TRAIN_CPU_COUNT = 4
MINUTES = 40


@app.function(gpu=TRAIN_GPU, cpu=TRAIN_CPU_COUNT, timeout=MINUTES * 60)
def quantize_model() -> None:
    import types

    import torch
    from loguru import logger
    from optimum.gptq import GPTQQuantizer
    from transformers import AutoModel, AutoTokenizer

    output_volume.reload()

    # Check if CUDA is available, show device count
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(f"Device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"PyTorch version: {torch.__version__}")

    # Check if GPU is available
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

    # We need to do some shenanigans.
    # First, load the model on the CPU so we can patch the forward pass.
    logger.info("Loading model on CPU...")
    model = AutoModel.from_pretrained(
        "GSAI-ML/LLaDA-8B-Instruct",
        trust_remote_code=True,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    logger.info("Patching model forward pass...")

    def patched_forward(self, x, *args, attention_bias=None, layer_past=None, use_cache=False, **kwargs):
        """
        Patched forward that handles both positional and keyword arguments for attention_bias
        """
        # If attention_bias was passed as positional argument, use it
        if len(args) > 0 and attention_bias is None:
            attention_bias = args[0]
            args = args[1:]
        if len(args) > 0 and layer_past is None:
            layer_past = args[0]
            args = args[1:]
        if len(args) > 0:
            use_cache = args[0]

        # Call the original forward with cleaned arguments
        return self._original_forward(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)

    # Apply patch to all LLaDALlamaBlock instances
    for name, module in model.named_modules():
        if module.__class__.__name__ == "LLaDALlamaBlock":
            # Store original forward
            module._original_forward = module.forward
            # Replace with patched version
            module.forward = types.MethodType(patched_forward, module)
            logger.info(f"Patched {name}")

    # Move model to GPU
    logger.info("Moving model to GPU...")
    model = model.to("cuda")

    logger.info("Setting up GPTQ quantizer...")
    quantizer = GPTQQuantizer(
        bits=4,
        group_size=128,
        desc_act=False,
        sym=True,
        true_sequential=True,
        dataset="c4",
        tokenizer=tokenizer,
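        # LLaDA stores its transformer blocks under model.transformer.blocks,
        # so point the quantizer at them explicitly.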
        block_name_to_quantize="model.transformer.blocks",
    )

    logger.info("Quantizing model...")
    quantizer.quantize_model(model, tokenizer)

    logger.info("Quantization done, saving model...")
    output_path = "/quantized-model-output/llada-8b-instruct-4bit-gptq"
    logger.info(f"Saving model to {output_path}")
    quantizer.save(model, output_path)
    tokenizer.save_pretrained(output_path)

    logger.success("Quantization done, model saved!")


@app.local_entrypoint()
def main():
    quantize_model.remote()
```
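
The quantized checkpoint ends up on the `quantized-model-output` volume rather than on your machine. If I remember the Modal CLI correctly, you can pull it down locally with something like the following (double-check `uv run modal volume get --help` for the exact arguments):

```
uv run modal volume get quantized-model-output llada-8b-instruct-4bit-gptq ./llada-8b-instruct-4bit-gptq
```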

I think I also had to manually fix a key in the saved config.json: it needed to be `"block_name_to_quantize": "model.transformer.blocks"`, instead of whatever key was written out (maybe `"model_block_name_to_quantize"` or something like that).
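
Once the config is fixed, loading the checkpoint locally should look roughly like the sketch below. Treat it as an untested sketch: the path is a placeholder for wherever you downloaded the volume contents, and you'll need the GPTQ pieces from the script above (optimum plus gptqmodel) installed so transformers can handle the quantized weights.

```
# Rough sketch of loading the 4-bit GPTQ checkpoint locally (untested; path is a placeholder).
import torch
from transformers import AutoModel, AutoTokenizer

path = "./llada-8b-instruct-4bit-gptq"  # wherever you downloaded the volume contents

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    device_map="cuda",
    torch_dtype=torch.float16,
)
model.eval()
```

Keep in mind that LLaDA is a diffusion language model, so as far as I understand you sample from it with the masked-diffusion loop from the LLaDA repo rather than calling `model.generate()`.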