yujiepan committed on
Commit ea48c64 · verified · 1 Parent(s): c685004

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,264 @@
---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- stepfun-ai/step3
---

This tiny model is for debugging. It is randomly initialized with the config adapted from [stepfun-ai/step3](https://huggingface.co/stepfun-ai/step3).

Note: if you want the model version that follows transformers' weight naming, see the companion model without the "-vllm" suffix.

### Example usage:

- vLLM

```bash
vllm serve yujiepan/step3-tiny-random-vllm --trust-remote-code
```
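
Once the server is up, you can send it a request through vLLM's OpenAI-compatible API. This is a minimal sketch, assuming the default endpoint `http://localhost:8000/v1` and the `openai` Python client; since the weights are random, the reply is meaningless and only confirms the server responds.

```python
# Minimal sketch: query the vLLM server started above via its OpenAI-compatible API.
# Assumes the default endpoint http://localhost:8000/v1 and `pip install openai`.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # any placeholder key works unless the server sets one
response = client.chat.completions.create(
    model="yujiepan/step3-tiny-random-vllm",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=16,
)
print(response.choices[0].message.content)  # random weights -> meaningless text
```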

- Transformers

```python
# Note: it's more convenient to use the model without "-vllm" suffix, which follows transformers' naming. Here "key_mapping" is a workaround.

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "yujiepan/step3-tiny-random-vllm"
key_mapping = {
    "^vision_model": "model.vision_model",
    r"^model(?!\.(language_model|vision_model))": "model.language_model",
    "vit_downsampler": "model.vit_downsampler",
    "vit_downsampler2": "model.vit_downsampler2",
    "vit_large_projector": "model.vit_large_projector",
}
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda", torch_dtype=torch.bfloat16,
    trust_remote_code=True, key_mapping=key_mapping,
)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "What's in this picture?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
print(decoded)
```
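
For reference, the `key_mapping` above simply renames vLLM-style checkpoint keys into the names the transformers implementation expects. The snippet below is a rough standalone illustration of that renaming, an approximation for clarity rather than the exact matching logic transformers applies internally.

```python
# Rough illustration (assumption: first-match regex substitution approximates what
# transformers does with `key_mapping`); shows how vLLM-style weight names map to
# transformers-style names. Not needed for inference.
import re

key_mapping = {
    "^vision_model": "model.vision_model",
    r"^model(?!\.(language_model|vision_model))": "model.language_model",
    "vit_downsampler": "model.vit_downsampler",
    "vit_downsampler2": "model.vit_downsampler2",
    "vit_large_projector": "model.vit_large_projector",
}

def remap(name: str) -> str:
    for pattern, replacement in key_mapping.items():
        new_name, n = re.subn(pattern, replacement, name)
        if n:
            return new_name
    return name

print(remap("model.layers.0.self_attn.q_proj.weight"))
# -> model.language_model.layers.0.self_attn.q_proj.weight
print(remap("vision_model.embeddings.patch_embedding.weight"))
# -> model.vision_model.embeddings.patch_embedding.weight
```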

### Code to create this repo:

```python
import json
from pathlib import Path

import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    set_seed,
)

source_model_id = "stepfun-ai/step3"
save_folder = "/tmp/yujiepan/step3-tiny-random-vllm"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

def rewrite_automap(filepath: str, source_model_id: str, overrides: dict = None):
    # Point every auto_map entry back to the source repo so remote code is loaded from there.
    with open(filepath, 'r', encoding='utf-8') as f:
        config = json.load(f)
    for k, v in config['auto_map'].items():
        v = v.split('--')[-1]
        config['auto_map'][k] = f'{source_model_id}--{v}'
    if overrides is not None:
        config.update(overrides)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

rewrite_automap(f'{save_folder}/processor_config.json', source_model_id)
rewrite_automap(f'{save_folder}/tokenizer_config.json', source_model_id)

with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

for k, v in config_json['auto_map'].items():
    config_json['auto_map'][k] = f'{source_model_id}--{v}'
config_json['architectures'] = ["Step3VLForConditionalGeneration"]
config_json['text_config'].update({
    "hidden_size": 32,
    "intermediate_size": 64,
    "num_hidden_layers": 2,
    "num_attention_heads": 2,
    "num_attention_groups": 1,
    "head_dim": 256,
    "share_q_dim": 512,
    "moe_layers_enum": "1",
    "moe_num_experts": 8,
    "moe_top_k": 3,
    "moe_intermediate_size": 64,
    "share_expert_dim": 64,
    "tie_word_embeddings": True,
})
config_json['vision_config'].update({
    "hidden_size": 64,
    "output_hidden_size": 64,
    "intermediate_size": 128,
    "num_hidden_layers": 2,
    "num_attention_heads": 2
})

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)
config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
automap = config_json['auto_map']
torch.set_default_dtype(torch.bfloat16)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()  # cpu is more stable for random initialization across machines
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape)

model.save_pretrained(save_folder)

import safetensors.torch

# Rename the transformers-style keys to the vLLM-style layout expected by the "-vllm" repo.
new_tensors = {}
with safetensors.safe_open(f'{save_folder}/model.safetensors', framework='pt', device='cpu') as f:
    for k in list(f.keys()):
        v = f.get_tensor(k)
        if k.startswith('model.language_model.'):
            k = k.replace('model.language_model.', 'model.')
            new_tensors[k] = v
        elif k.startswith('model.vi'):
            k = k.replace('model.vi', 'vi')
            new_tensors[k] = v
        else:
            new_tensors[k] = v
safetensors.torch.save_file(new_tensors, f"{save_folder}/model.safetensors")

rewrite_automap(
    f'{save_folder}/config.json', source_model_id,
    overrides=dict(architectures=['Step3VLForConditionalGeneration']),
)
# Drop the local python files; the rewritten auto_map points back to the source repo for remote code.
for python_file in Path(save_folder).glob('*.py'):
    python_file.unlink()
```
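
A quick sanity check after running the script (a sketch reusing `save_folder` from above): verify that the saved checkpoint now uses vLLM-style key names, i.e. no `model.language_model.` or `model.vision_model.` prefixes remain.

```python
# Sketch: verify the renamed checkpoint layout. `save_folder` is the path used above.
import safetensors

with safetensors.safe_open(f"{save_folder}/model.safetensors", framework="pt", device="cpu") as f:
    keys = list(f.keys())

assert not any(k.startswith("model.language_model.") for k in keys)
assert not any(k.startswith("model.vision_model.") for k in keys)
assert any(k.startswith("vision_model.") for k in keys)
print(f"{len(keys)} tensors; examples: {sorted(keys)[:3]}")
```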

### Printing the model:

```text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
```
chat_template.jinja ADDED
@@ -0,0 +1,20 @@
{% macro render_content(content) %} {% if content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %} {% endmacro %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message.role == 'system' %}{{ render_content(message['content']) }}{% endif %}{% endfor %}{% if tools is defined and tools %}{% set ns = namespace(data='') %}{% for tool in tools %}{% set ns.data = ns.data + (tool | tojson(ensure_ascii=False)) + '
' %}{% endfor %}{% set tool_schemas_var = ns.data %}# Tools
You may call one or more tools to assist with the user query. You are provided with tool schemas within <tools></tools> XML tags: <tools>{{ tool_schemas_var }}</tools> When making tool calls, use XML format to invoke tools and pass parameters: <|tool_calls_begin|>
<|tool_call_begin|>
function<|tool_sep|><steptml:invoke name="tool_name0"><steptml:parameter name="parameter_name0">[parameter value]</steptml:parameter>...</steptml:invoke><|tool_call_end|>
<|tool_call_begin|>
function<|tool_sep|><steptml:invoke name="tool_name1"><steptml:parameter name="parameter_name1">[parameter value]</steptml:parameter>...</steptml:invoke><|tool_call_end|>
<|tool_calls_end|>
Note: * You can invoke one or more tools in parallel. * Each tool call must be complete and self-contained within a single <steptml:toolcall></steptml:toolcall> block. {% endif %}{% for message in messages %}{% if message.role == 'tool_description' %}{{ render_content(message['content']) }}{% elif message.role == 'user' %}{{- '<|BOT|>' + message.role + '\n' + render_content(message['content']) }}{{- '<|EOT|>' }}{% elif message.role == 'tool_response' %}<|tool_outputs_begin|>
{% for tool_output in message['content'] %}<|tool_output_begin|>
{{ render_content(tool_output) }}<|tool_output_end|>{% endfor %}
<|tool_outputs_end|>
{% else %}{{- '<|BOT|>' + message.role + '
' }}{% if message['content'] is defined %}{{- render_content(message['content']) }}{% endif %}{% if message.tool_calls is defined %}<|tool_calls_begin|>
{% for tool in message.tool_calls %}<|tool_call_begin|>
{{ tool['type'] }}<|tool_sep|>{{- '<steptml:invoke name="' + tool['function']['name'] + '">' }}{% for name, param in tool['function']['arguments'].items() %} {{- '<steptml:parameter name="' + name + '">' + param | string + '</steptml:parameter>' }}{% endfor %}</steptml:invoke><|tool_call_end|>
{% endfor %}<|tool_calls_end|>
{% endif %}<|EOT|>{% endif %}{% endfor %}{% if add_generation_prompt %}{{- '<|BOT|>assistant
<think>
' }}{% endif %}
config.json ADDED
@@ -0,0 +1,62 @@
{
  "architectures": [
    "Step3VLForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "stepfun-ai/step3--configuration_step3.Step3VLConfig",
    "AutoModelForCausalLM": "stepfun-ai/step3--modeling_step3.Step3vForConditionalGeneration"
  },
  "bos_token_id": 0,
  "eos_token_id": 128805,
  "hidden_size": 32,
  "im_end_token": "<im_end>",
  "im_patch_token": "<im_patch>",
  "im_start_token": "<im_start>",
  "image_token_id": 128001,
  "image_token_len": 169,
  "model_type": "step3_vl",
  "patch_token_len": 81,
  "projector_bias": false,
  "text_config": {
    "architectures": [
      "Step3TextForCausalLM"
    ],
    "head_dim": 256,
    "hidden_size": 32,
    "intermediate_size": 64,
    "max_position_embedding": 65536,
    "max_seq_len": 65536,
    "model_type": "step3_text",
    "moe_intermediate_size": 64,
    "moe_layers_enum": "1",
    "moe_num_experts": 8,
    "moe_top_k": 3,
    "norm_expert_weight": false,
    "num_attention_groups": 1,
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "rope_theta": 500000,
    "share_expert_dim": 64,
    "share_q_dim": 512,
    "torch_dtype": "bfloat16",
    "vocab_size": 128815
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.1",
  "understand_projector_stride": 2,
  "vision_config": {
    "hidden_act": "quick_gelu",
    "hidden_size": 64,
    "image_size": 728,
    "intermediate_size": 128,
    "layer_norm_eps": 1e-05,
    "model_type": "step3_vision_encoder",
    "num_attention_heads": 2,
    "num_channels": 3,
    "num_hidden_layers": 2,
    "output_hidden_size": 64,
    "patch_size": 14
  }
}
generation_config.json ADDED
@@ -0,0 +1,9 @@
{
  "bos_token_id": 0,
  "do_sample": true,
  "eos_token_id": 128805,
  "temperature": 0.7,
  "top_p": 0.95,
  "transformers_version": "4.54.1",
  "trust_remote_code": true
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e46470c9bafefcc89fb8189e085a3496f3500d9f7b873cef07f64b3e76c0fab
size 18610672
processor_config.json ADDED
@@ -0,0 +1,6 @@
{
  "auto_map": {
    "AutoProcessor": "stepfun-ai/step3--processing_step3.Step3VLProcessor"
  },
  "processor_class": "Step3VLProcessor"
}
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
{
  "bos_token": {
    "content": "<|begin▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|EOT|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff