Generation

Requires: https://github.com/vllm-project/llm-compressor/pull/1788


from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

# Hub identifier of the checkpoint that will be quantized.
MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"

# Output directory: the repository name (organization prefix dropped)
# followed by a suffix naming the quantization scheme.
SAVE_DIR = f"{MODEL_ID.rpartition('/')[2]}-W4A16-awq"


# Quantization recipe: a single AWQ pass over the model's Linear modules
# using the "W4A16" scheme. Entries in `ignore` are left unquantized; the
# ones prefixed with "re:" are regular-expression patterns over module names.
awq_modifier = AWQModifier(
    targets=["Linear"],
    scheme="W4A16",
    ignore=[
        "lm_head",
        "re:.*mlp.gate$",
        "re:.*mlp.shared_expert_gate$",
        "re:visual.*",
    ],
    duo_scaling=False,
)
recipe = [awq_modifier]

# Calibration data: the dataset identifier and split handed to
# `datasets.load_dataset` in `get_calib_dataset`.
DATASET_ID = "codeparrot/self-instruct-starcoder"
DATASET_SPLIT = "curated"

# Number of calibration samples kept after shuffling. 256 samples is a good
# place to start; increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
# Forwarded to `oneshot` as `max_seq_length`.
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    """Build the tokenized calibration dataset.

    Loads the first ``NUM_CALIBRATION_SAMPLES * 10`` rows of
    ``DATASET_ID``/``DATASET_SPLIT``, shuffles them with a fixed seed and
    keeps ``NUM_CALIBRATION_SAMPLES`` of them. Each kept row is rendered as a
    two-turn user/assistant conversation and tokenized with the tokenizer's
    chat template.

    Args:
        tokenizer: Object exposing ``apply_chat_template``; here the tokenizer
            loaded for ``MODEL_ID``.

    Returns:
        A dataset whose only column is ``input_ids``.
    """
    from datasets import load_dataset

    raw = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES * 10}]",
    )

    def preprocess(example):
        # The instruction becomes the user turn, the reference output the
        # assistant turn.
        chat_messages = [
            {"role": "user", "content": example["instruction"].strip()},
            {"role": "assistant", "content": example["output"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(
            chat_messages, tokenize=True
        )
        return {"input_ids": tokenized_messages}

    # Select before mapping: `preprocess` works row by row, so the result is
    # the same as mapping first, but only the kept rows are tokenized instead
    # of all loaded rows.
    subset = raw.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    # Drop every original column so that only `input_ids` remains.
    return subset.map(preprocess, remove_columns=raw.column_names)


if __name__ == "__main__":
    # Load the checkpoint to quantize together with its tokenizer.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Build the calibration set, then apply the recipe to the model.
    calibration_data = get_calib_dataset(tokenizer)
    oneshot(
        model=model,
        recipe=recipe,
        dataset=calibration_data,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        trust_remote_code_model=True,
        log_dir=None,
    )

    # Write the model first, then the tokenizer, into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(SAVE_DIR)

Evaluation

The model was evaluated on the HumanEval and HumanEval+ benchmarks using the Neural Magic fork of the EvalPlus implementation and the vLLM engine, with the following commands:

python evalplus/codegen/generate.py --model nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq --bs 16 --temperature 0.2 --n_samples 50 --root "./results" --dataset humaneval --backend vllm --dtype auto 

python evalplus/evalplus/sanitize.py results/humaneval/nm-testing--Qwen3-Coder-30B-A3B-Instruct-W4A16-awq_vllm_temp_0.2

evalplus.evaluate --dataset humaneval --samples results/humaneval/nm-testing--Qwen3-Coder-30B-A3B-Instruct-W4A16-awq_vllm_temp_0.2-sanitized

| Metric | Qwen/Qwen3-Coder-30B-A3B-Instruct | nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq |
| --- | --- | --- |
| HumanEval pass@1 | 93.0 | 93.7 |
| HumanEval pass@10 | 93.9 | 94.5 |
| HumanEval+ pass@1 | 88.7 | 89.3 |
| HumanEval+ pass@10 | 89.8 | 90.2 |
| Average Score | 91.35 | 91.93 |

Downloads last month
73,052
Safetensors
Model size
5B params
Tensor type
I64
·
I32
·
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq

Quantized
(127)
this model