---
license: mit
base_model:
- GSAI-ML/LLaDA-8B-Instruct
pipeline_tag: text-generation
---

Baby's first adventure with a diffusion language model. I had to quantize it so it would fit on a 3080 Ti - all I've got!

I used Modal to do the quantization. If you want to replicate what I did, try this:

First, install Modal and log into the CLI.

```
uv add modal
uv run modal login
```

Then, add an environment and a volume to the project.

```
uv run modal volume create quantized-model-output
```
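
For the environment part, I don't remember the exact command I used. The Modal CLI has an `environment create` subcommand, so something like the line below should do it (the environment name is just a placeholder, and the default environment should also be fine if you skip this) -- double-check `uv run modal environment --help`:

```
uv run modal environment create llada-quant
```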
Then, run the quantization script:

```
uv run modal run scripts/quantize_llada.py
```

```
# scripts/quantize_llada.py

import modal

image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04", add_python="3.13")
    .apt_install("git", "curl")
    .pip_install(
        "torch>=2.7.0",
        "torchvision",
        "torchaudio",
        index_url="https://download.pytorch.org/whl/cu128",
    )
    .pip_install(
        "numpy",
        "accelerate",
        "optimum",
        "loguru",
        "transformers",
    )
    .pip_install("triton")
    .pip_install("gptqmodel")
)

# These need to be created before running the app:
# uv run modal volume create quantized-model-output
output_volume = modal.Volume.from_name("quantized-model-output")
volume_config = {"/quantized-model-output": output_volume}

app = modal.App("quantize-llada", image=image, volumes=volume_config)

TRAIN_GPU_COUNT = 1
TRAIN_GPU = f"B200:{TRAIN_GPU_COUNT}"
TRAIN_CPU_COUNT = 4
MINUTES = 40


@app.function(gpu=TRAIN_GPU, cpu=TRAIN_CPU_COUNT, timeout=MINUTES * 60)
def quantize_model() -> None:
    import types

    import torch
    from loguru import logger
    from optimum.gptq import GPTQQuantizer
    from transformers import AutoModel, AutoTokenizer

    output_volume.reload()

    # Check if CUDA is available, show device count
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(f"Device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"PyTorch version: {torch.__version__}")

    # Check if GPU is available
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

    # We need to do some shenanigans.
    # First, load the model on the CPU so we can patch the forward pass.
    logger.info("Loading model on CPU...")
    model = AutoModel.from_pretrained(
        "GSAI-ML/LLaDA-8B-Instruct",
        trust_remote_code=True,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    logger.info("Patching model forward pass...")

    def patched_forward(self, x, *args, attention_bias=None, layer_past=None, use_cache=False, **kwargs):
        """
        Patched forward that handles both positional and keyword arguments for attention_bias
        """
        # If attention_bias was passed as positional argument, use it
        if len(args) > 0 and attention_bias is None:
            attention_bias = args[0]
            args = args[1:]
        if len(args) > 0 and layer_past is None:
            layer_past = args[0]
            args = args[1:]
        if len(args) > 0:
            use_cache = args[0]

        # Call the original forward with cleaned arguments
        return self._original_forward(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)

    # Apply patch to all LLaDALlamaBlock instances
    for name, module in model.named_modules():
        if module.__class__.__name__ == "LLaDALlamaBlock":
            # Store original forward
            module._original_forward = module.forward
            # Replace with patched version
            module.forward = types.MethodType(patched_forward, module)
            logger.info(f"Patched {name}")

    # Move model to GPU
    logger.info("Moving model to GPU...")
    model = model.to("cuda")

    logger.info("Setting up GPTQ quantizer...")
    quantizer = GPTQQuantizer(
        bits=4,
        group_size=128,
        desc_act=False,
        sym=True,
        true_sequential=True,
        dataset="c4",
        tokenizer=tokenizer,
        block_name_to_quantize="model.transformer.blocks",
    )

    logger.info("Quantizing model...")
    quantizer.quantize_model(model, tokenizer)

    logger.info("Quantization done, saving model...")
    output_path = "/quantized-model-output/llada-8b-instruct-4bit-gptq"
    logger.info(f"Saving model to {output_path}")
    quantizer.save(model, output_path)
    tokenizer.save_pretrained(output_path)

    logger.success("Quantization done, model saved!")


@app.local_entrypoint()
def main():
    quantize_model.remote()
```
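
The quantized weights end up on the Modal volume rather than on your machine. I don't remember the exact invocation I used to pull them down, but `modal volume get` copies files out of a volume, so something along these lines should work (the local destination is just an example):

```
uv run modal volume get quantized-model-output llada-8b-instruct-4bit-gptq ./llada-8b-instruct-4bit-gptq
```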
I think I also had to manually change something in the config.json afterwards: the key needed to be "block_name_to_quantize": "model.transformer.blocks", but the quantizer had written it under a different name (maybe "model_block_name_to_quantize" or something like that).
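
If you hit the same thing, here's a minimal sketch of the config.json fix, assuming the settings live under a `quantization_config` key and the misnamed key really is `model_block_name_to_quantize` (I don't remember the exact name, so inspect your own config.json first):

```
# patch_config.py -- hedged sketch: rename the block-name key in the quantized model's config.json.
# The "bad" key name and the config layout are assumptions; check the file before running this.
import json
from pathlib import Path

config_path = Path("llada-8b-instruct-4bit-gptq/config.json")  # wherever the quantized model lives

config = json.loads(config_path.read_text())
quant_cfg = config.get("quantization_config", {})

# Drop the misnamed key (if present) and write the key the loader actually looks for.
quant_cfg.pop("model_block_name_to_quantize", None)
quant_cfg["block_name_to_quantize"] = "model.transformer.blocks"
config["quantization_config"] = quant_cfg

config_path.write_text(json.dumps(config, indent=2))
print(f"Updated {config_path}")
```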