yujiepan committed
Commit 125415b · verified · 1 Parent(s): 58ea2cb

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +6 -32
  2. config.json +2 -11
  3. model.safetensors +2 -2
README.md CHANGED
@@ -12,7 +12,7 @@ base_model:
 
 This tiny model is for debugging. It is randomly initialized with the config adapted from [openai/gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b).
 
-Note: This model used quantized MXFP4 FFN. `pip install -U triton git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels`
+Note: This model is in BF16; quantized MXFP4 FFN is not used.
 
 ### Example usage:
 
@@ -33,8 +33,8 @@ model_id = "yujiepan/gpt-oss-tiny-random"
 pipe = pipeline(
     "text-generation",
     model=model_id,
-    torch_dtype='auto',
-    device_map="cuda",
+    torch_dtype=torch.bfloat16,
+    device_map="cuda"
 )
 
 messages = [
@@ -53,7 +53,6 @@ print(outputs[0]["generated_text"][-1])
 ```python
 import json
 
-import safetensors
 import torch
 from huggingface_hub import hf_hub_download
 from transformers import (
@@ -94,7 +93,7 @@ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
 config = AutoConfig.from_pretrained(save_folder)
 print(config)
 torch.set_default_dtype(torch.bfloat16)
-model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_config(config)
 torch.set_default_dtype(torch.float32)
 model.generation_config = GenerationConfig.from_pretrained(
     source_model_id, trust_remote_code=True,
@@ -107,32 +106,7 @@ with torch.no_grad():
 model.save_pretrained(save_folder)
 
 # mxfp4
-from unittest.mock import Mock
-
-from transformers.quantizers.auto import AutoHfQuantizer
 from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
-_get_device_capability = torch.cuda.get_device_capability
-torch.cuda.get_device_capability = Mock(return_value=(9, 0))
-set_seed(42)
-bf16_state_dict = model.cuda().state_dict()
-hf_quantizer: Mxfp4HfQuantizer = AutoHfQuantizer.from_config(quantization_config)
-hf_quantizer.pre_quantized = False
-ffn_keys = ['model.layers.0.mlp.experts.down_proj', 'model.layers.0.mlp.experts.gate_up_proj',
-            'model.layers.1.mlp.experts.down_proj', 'model.layers.1.mlp.experts.gate_up_proj']
-for key in ffn_keys:
-    hf_quantizer.create_quantized_param(model, bf16_state_dict[key], key, "cuda", bf16_state_dict)
-state_dict = model.state_dict()
-del state_dict['lm_head.weight']
-for key in ffn_keys:
-    del state_dict[key]
-for k, v in state_dict.items():
-    if str(v.device) == 'meta':
-        print(k, v.device, v.shape)
-
-safetensors.torch.save_file(state_dict, f"{save_folder}/model.safetensors")
-with open(f"{save_folder}/config.json", "r", encoding='utf-8') as f:
-    config = json.load(f)
-config['quantization_config'] = quantization_config
-with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
-    json.dump(config, f, indent=2)
+# model = AutoModelForCausalLM.from_pretrained(save_folder, trust_remote_code=True, torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+# model.save_pretrained(save_folder, safe_serialization=True)
 ```
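After this change the checkpoint is plain BF16, so it loads without the triton_kernels dependency the old README required. A minimal smoke-test sketch, using the repo id from the README snippet above:

```python
# Minimal sketch: load the now-BF16 tiny checkpoint as the updated README describes.
# No MXFP4 / triton_kernels install is needed anymore.
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="yujiepan/gpt-oss-tiny-random",  # repo id from the README snippet above
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

messages = [{"role": "user", "content": "Hello!"}]
print(pipe(messages, max_new_tokens=16)[0]["generated_text"][-1])
```

The commented-out `from_pretrained(..., quantization_config=quantization_config)` lines left in the creation script show how an MXFP4 variant could be regenerated from this BF16 checkpoint if needed.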
config.json CHANGED
@@ -42,14 +42,5 @@
   "torch_dtype": "bfloat16",
   "transformers_version": "4.56.0.dev0",
   "use_cache": true,
-  "vocab_size": 201088,
-  "quantization_config": {
-    "modules_to_not_convert": [
-      "model.layers.*.self_attn",
-      "model.layers.*.mlp.router",
-      "model.embed_tokens",
-      "lm_head"
-    ],
-    "quant_method": "mxfp4"
-  }
-}
+  "vocab_size": 201088
+}
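With `quantization_config` removed from config.json, transformers now treats the checkpoint as an ordinary BF16 model. A quick check, a sketch assuming the same repo id as above:

```python
# Sketch: verify the updated config no longer advertises MXFP4 quantization.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("yujiepan/gpt-oss-tiny-random")
print(config.torch_dtype)                            # torch.bfloat16
print(getattr(config, "quantization_config", None))  # None after this commit
```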
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49ed1200e8437107e3a989c5582f4309c0eca660565638a6c4d95600d116b0f8
-size 12923264
+oid sha256:aefe8b9c4b4969f6d13c5d778760f3dce4e25134324b33677934550d9df02a7c
+size 13710176
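The new LFS pointer means the weights themselves were re-uploaded; the file grew from 12,923,264 to 13,710,176 bytes, presumably because the FFN expert weights are now stored in BF16 rather than packed MXFP4 blocks. To inspect the tensors without instantiating the model, a sketch assuming `safetensors` is installed:

```python
# Sketch: list tensor names, dtypes, and shapes straight from the safetensors header.
from huggingface_hub import hf_hub_download
from safetensors import safe_open

path = hf_hub_download("yujiepan/gpt-oss-tiny-random", "model.safetensors")
with safe_open(path, framework="pt") as f:
    for name in f.keys():
        tensor_slice = f.get_slice(name)  # lazy slice; nothing is loaded into memory
        print(name, tensor_slice.get_dtype(), tensor_slice.get_shape())
```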