amd
/

Llama-3.1-405B-Instruct-MXFP4-Preview

8-bit precision

Model card · Files and versions · Community

linzhao-amd committed 24 days ago

Commit

7b7373c

·

verified ·

1 Parent(s): 0164045

Update README.md

Files changed (1) hide show

README.md +1 -0

README.md CHANGED Viewed

@@ -32,6 +32,7 @@ cd Quark/examples/torch/language_modeling/llm_ptq/
 python3 quantize_quark.py --model_dir "meta-llama/Llama-3.1-405B-Instruct" \
                           --model_attn_implementation "sdpa" \
                           --quant_scheme w_mxfp4_a_mxfp4 \
                           --kv_cache_dtype fp8 \
                           --quant_algo autosmoothquant \
                           --min_kv_scale 1.0 \

 python3 quantize_quark.py --model_dir "meta-llama/Llama-3.1-405B-Instruct" \
                           --model_attn_implementation "sdpa" \
                           --quant_scheme w_mxfp4_a_mxfp4 \
+                          --group_size 32 \
                           --kv_cache_dtype fp8 \
                           --quant_algo autosmoothquant \
                           --min_kv_scale 1.0 \