Upload folder using huggingface_hub
- README.md +34 -7
- config.json +12 -3
- model.safetensors +2 -2
README.md CHANGED
@@ -12,7 +12,7 @@ base_model:
 
 This tiny model is for debugging. It is randomly initialized with the config adapted from [openai/gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b).
 
-Note: This model
+Note: This model uses a quantized MXFP4 FFN. `pip install -U triton git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels`
 
 ### Example usage:
 
@@ -33,8 +33,8 @@ model_id = "yujiepan/gpt-oss-tiny-random"
 pipe = pipeline(
     "text-generation",
     model=model_id,
-    torch_dtype=
-    device_map="cuda"
+    torch_dtype='auto',
+    device_map="cuda",
 )
 
 messages = [
@@ -53,6 +53,7 @@ print(outputs[0]["generated_text"][-1])
 ```python
 import json
 
+import safetensors
 import torch
 from huggingface_hub import hf_hub_download
 from transformers import (
@@ -76,7 +77,7 @@ with open(hf_hub_download(source_model_id, filename='config.json', repo_type='mo
     config_json = json.load(f)
 config_json.update({
     "head_dim": 32,
-    "hidden_size":
+    "hidden_size": 32,  # required by the Mxfp4GptOssExperts code
     "intermediate_size": 64,
     "layer_types": ["sliding_attention", "full_attention"],
     "num_attention_heads": 2,
@@ -93,7 +94,7 @@ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
 config = AutoConfig.from_pretrained(save_folder)
 print(config)
 torch.set_default_dtype(torch.bfloat16)
-model = AutoModelForCausalLM.from_config(config)
+model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
 torch.set_default_dtype(torch.float32)
 model.generation_config = GenerationConfig.from_pretrained(
     source_model_id, trust_remote_code=True,
@@ -106,6 +107,32 @@ with torch.no_grad():
 model.save_pretrained(save_folder)
 
 # mxfp4
-
-
+from unittest.mock import Mock
+
+from transformers.quantizers.auto import AutoHfQuantizer
+from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
+_get_device_capability = torch.cuda.get_device_capability
+torch.cuda.get_device_capability = Mock(return_value=(9, 0))
+set_seed(42)
+bf16_state_dict = model.cuda().state_dict()
+hf_quantizer: Mxfp4HfQuantizer = AutoHfQuantizer.from_config(quantization_config)
+hf_quantizer.pre_quantized = False
+ffn_keys = ['model.layers.0.mlp.experts.down_proj', 'model.layers.0.mlp.experts.gate_up_proj',
+            'model.layers.1.mlp.experts.down_proj', 'model.layers.1.mlp.experts.gate_up_proj']
+for key in ffn_keys:
+    hf_quantizer.create_quantized_param(model, bf16_state_dict[key], key, "cuda", bf16_state_dict)
+state_dict = model.state_dict()
+del state_dict['lm_head.weight']
+for key in ffn_keys:
+    del state_dict[key]
+for k, v in state_dict.items():
+    if str(v.device) == 'meta':
+        print(k, v.device, v.shape)
+
+safetensors.torch.save_file(state_dict, f"{save_folder}/model.safetensors")
+with open(f"{save_folder}/config.json", "r", encoding='utf-8') as f:
+    config = json.load(f)
+config['quantization_config'] = quantization_config
+with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
+    json.dump(config, f, indent=2)
 ```
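Not part of the commit, but a minimal sanity-check sketch for the checkpoint this README describes: it assumes a CUDA device and the triton/triton_kernels install from the note above, and simply reloads the repo and runs one forward pass.

```python
# Hedged sketch (not from the README): reload the mxfp4 tiny model and check the output shape.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "yujiepan/gpt-oss-tiny-random"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda")

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # expected (1, seq_len, 201088), matching the vocab_size in config.json
```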
config.json CHANGED
@@ -8,7 +8,7 @@
   "experts_per_token": 4,
   "head_dim": 32,
   "hidden_act": "silu",
-  "hidden_size":
+  "hidden_size": 32,
   "initial_context_length": 4096,
   "initializer_range": 0.02,
   "intermediate_size": 64,
@@ -42,5 +42,14 @@
   "torch_dtype": "bfloat16",
   "transformers_version": "4.56.0.dev0",
   "use_cache": true,
-  "vocab_size": 201088
-}
+  "vocab_size": 201088,
+  "quantization_config": {
+    "modules_to_not_convert": [
+      "model.layers.*.self_attn",
+      "model.layers.*.mlp.router",
+      "model.embed_tokens",
+      "lm_head"
+    ],
+    "quant_method": "mxfp4"
+  }
+}
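As a hedged illustration (not in the commit), the `quantization_config` block added above is what transformers inspects to route loading through the mxfp4 quantizer; a quick check that the fields round-trip could look like:

```python
# Sketch: confirm the edited config fields are visible through transformers.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("yujiepan/gpt-oss-tiny-random")
print(config.hidden_size)          # 32, per the hunk above
print(config.quantization_config)  # dict with quant_method "mxfp4" and modules_to_not_convert
```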
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:49ed1200e8437107e3a989c5582f4309c0eca660565638a6c4d95600d116b0f8
+size 12923264
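The new size of 12,923,264 bytes is dominated by the bf16 embedding matrix: vocab_size 201088 × hidden_size 32 × 2 bytes ≈ 12.87 MB, leaving roughly 52 KB for the two tiny layers and the packed MXFP4 expert tensors. A hedged sketch (not part of the commit) for inspecting the shard locally:

```python
# Sketch: download the shard and list tensor names, dtypes, and shapes,
# e.g. to spot the uint8 blocks/scales written by the mxfp4 quantizer.
from huggingface_hub import hf_hub_download
from safetensors import safe_open

path = hf_hub_download("yujiepan/gpt-oss-tiny-random", "model.safetensors")
with safe_open(path, framework="pt") as f:
    for name in f.keys():
        tensor = f.get_tensor(name)
        print(name, tensor.dtype, tuple(tensor.shape))
```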