mrdmnd committed · Commit bd78f29 · verified · 1 Parent(s): 061017a

Initial readme

Files changed (1): README.md (+165, -3)

README.md CHANGED
The previous stub (three lines of front matter containing only `license: mit`) is replaced by the full README below:

---
license: mit
base_model:
- GSAI-ML/LLaDA-8B-Instruct
pipeline_tag: text-generation
---

Baby's first adventure with a diffusion language model. I had to quantize it so it would fit on a 3080 Ti - all I've got! In fp16 the 8B parameters alone take roughly 16 GB, well past the 3080 Ti's 12 GB of VRAM; at 4 bits the weights shrink to roughly 4-5 GB.

I used Modal to do the quantization:

````python
"""
This script uses Modal to quantize the LLaDA family of models.

First, install Modal and log into the CLI.

```
uv add modal
uv run modal login
```

Then, add a volume to the project for the quantized output.

```
uv run modal volume create quantized-model-output
```

Then, run the quantization script:

```
uv run modal run scripts/quantize_llada.py
```
"""

import modal

image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04", add_python="3.13")
    .apt_install("git", "curl")
    .pip_install(
        "torch>=2.7.0",
        "torchvision",
        "torchaudio",
        index_url="https://download.pytorch.org/whl/cu128",
    )
    .pip_install(
        "numpy",
        "accelerate",
        "optimum",
        "loguru",
        "transformers",
    )
    .pip_install("triton")
    .pip_install("gptqmodel")
)

# These need to be created before running the app:
# uv run modal volume create quantized-model-output
output_volume = modal.Volume.from_name("quantized-model-output")
volume_config = {"/quantized-model-output": output_volume}

app = modal.App("quantize-llada", image=image, volumes=volume_config)

TRAIN_GPU_COUNT = 1
TRAIN_GPU = f"B200:{TRAIN_GPU_COUNT}"
TRAIN_CPU_COUNT = 4
MINUTES = 40


@app.function(gpu=TRAIN_GPU, cpu=TRAIN_CPU_COUNT, timeout=MINUTES * 60)
def quantize_model() -> None:
    import types

    import torch
    from loguru import logger
    from optimum.gptq import GPTQQuantizer
    from transformers import AutoModel, AutoTokenizer

    output_volume.reload()

    # Check if CUDA is available, show device count
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(f"Device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"PyTorch version: {torch.__version__}")

    # Check if GPU is available
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

    # We need to do some shenanigans.
    # First, load the model on the CPU so we can patch the forward pass.
    logger.info("Loading model on CPU...")
    model = AutoModel.from_pretrained(
        "GSAI-ML/LLaDA-8B-Instruct",
        trust_remote_code=True,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    logger.info("Patching model forward pass...")

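    # The shim below remaps attention_bias / layer_past / use_cache that arrive as
    # positional arguments (presumably how the GPTQ calibration pass invokes each block)
    # back into keyword arguments before delegating to the block's original forward.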
    def patched_forward(self, x, *args, attention_bias=None, layer_past=None, use_cache=False, **kwargs):
        """
        Patched forward that handles both positional and keyword arguments for attention_bias
        """
        # If attention_bias was passed as positional argument, use it
        if len(args) > 0 and attention_bias is None:
            attention_bias = args[0]
            args = args[1:]
        if len(args) > 0 and layer_past is None:
            layer_past = args[0]
            args = args[1:]
        if len(args) > 0:
            use_cache = args[0]

        # Call the original forward with cleaned arguments
        return self._original_forward(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)

    # Apply patch to all LLaDALlamaBlock instances
    for name, module in model.named_modules():
        if module.__class__.__name__ == "LLaDALlamaBlock":
            # Store original forward
            module._original_forward = module.forward
            # Replace with patched version
            module.forward = types.MethodType(patched_forward, module)
            logger.info(f"Patched {name}")

    # Move model to GPU
    logger.info("Moving model to GPU...")
    model = model.to("cuda")

    logger.info("Setting up GPTQ quantizer...")
    quantizer = GPTQQuantizer(
        bits=4,  # 4-bit weights
        group_size=128,  # one scale/zero-point per group of 128 weights
        desc_act=False,  # no activation-order reordering
        sym=True,  # symmetric quantization
        true_sequential=True,  # quantize sub-modules within each block sequentially
        dataset="c4",  # calibration data
        tokenizer=tokenizer,
        block_name_to_quantize="model.transformer.blocks",  # where LLaDA keeps its transformer blocks
    )

    logger.info("Quantizing model...")
    quantizer.quantize_model(model, tokenizer)

    logger.info("Quantization done, saving model...")
    output_path = "/quantized-model-output/llada-8b-instruct-4bit-gptq"
    logger.info(f"Saving model to {output_path}")
    quantizer.save(model, output_path)
    tokenizer.save_pretrained(output_path)

    logger.success("Quantization done, model saved!")
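
# The quantized folder ends up on the `quantized-model-output` Modal volume; it can be
# pulled down locally with something like:
#   modal volume get quantized-model-output llada-8b-instruct-4bit-gptq ./
# (argument order assumed - check `modal volume get --help`)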


@app.local_entrypoint()
def main():
    quantize_model.remote()
````
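
Once the quantized folder is pulled down (or grabbed from this repo), a minimal loading sketch is below. Everything in it is an assumption rather than a tested recipe: the local path is hypothetical, LLaDA's custom code still needs `trust_remote_code=True`, and a GPTQ checkpoint needs a GPTQ backend (e.g. `gptqmodel`) plus `accelerate` installed for transformers to load it. Generation then follows LLaDA's diffusion-style sampling (see the upstream GSAI-ML/LLaDA-8B-Instruct card) rather than a plain autoregressive `generate()` call.

```python
# Minimal loading sketch (untested assumption): point `path` at wherever the
# quantized folder lives - a local copy pulled from the Modal volume, or this repo's id.
import torch
from transformers import AutoModel, AutoTokenizer

path = "./llada-8b-instruct-4bit-gptq"  # hypothetical local path

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,   # LLaDA ships custom modeling code
    torch_dtype=torch.float16,
    device_map="auto",        # should land the ~4-5 GB of 4-bit weights on the 3080 Ti
)
model.eval()
```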