yujiepan committed · verified
Commit fe309fc · 1 parent: 5dcc599

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,226 @@
---
library_name: transformers
pipeline_tag: image-text-to-text
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- zai-org/GLM-4.5V
---

This tiny model is for debugging. It is randomly initialized, using a config adapted from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V).

### Example usage:

```python
import torch
from transformers import AutoProcessor, Glm4vMoeForConditionalGeneration

model_id = "tiny-random/glm-4.5v"
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"
            },
            {
                "type": "text",
                "text": "describe this image"
            }
        ],
    }
]
processor = AutoProcessor.from_pretrained(model_id)
model = Glm4vMoeForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)
inputs.pop("token_type_ids", None)  # drop the key if the processor emits it; generate() does not use it
generated_ids = model.generate(**inputs, max_new_tokens=16)
output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
print(output_text)
```
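
Because the weights are random, the decoded text is meaningless; the model is only useful for smoke-testing a pipeline end to end. A minimal text-only sanity check might look like this (my own sketch, not from the original card):

```python
# Hypothetical text-only smoke test; the generated tokens are gibberish by design.
import torch
from transformers import AutoProcessor, Glm4vMoeForConditionalGeneration

model_id = "tiny-random/glm-4.5v"
processor = AutoProcessor.from_pretrained(model_id)
model = Glm4vMoeForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16)
messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}]
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
inputs.pop("token_type_ids", None)
generated_ids = model.generate(**inputs, max_new_tokens=4)
print(processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:]))
```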

### Code to create this repo:

```python
import json
from pathlib import Path

import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    GenerationConfig,
    Glm4vForConditionalGeneration,
    Glm4vMoeForConditionalGeneration,
    set_seed,
)
from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeTextTopkRouter

source_model_id = "zai-org/GLM-4.5V"
save_folder = "/tmp/tiny-random/glm-4.5v"
processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

# Shrink the text and vision towers of the source config to tiny dimensions.
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)
config_json['text_config'].update({
    "hidden_size": 32,
    "head_dim": 32,
    "intermediate_size": 128,
    "first_k_dense_replace": 1,
    "moe_intermediate_size": 64,
    "num_attention_heads": 2,
    "num_key_value_heads": 1,
    "num_hidden_layers": 2,  # one dense, one MoE
    "tie_word_embeddings": True,
})
config_json['text_config']['rope_scaling']['mrope_section'] = [2, 2, 4]
config_json['vision_config']['hidden_size'] = 64
config_json['vision_config']['depth'] = 2
config_json['vision_config']['num_heads'] = 2
config_json['vision_config']['intermediate_size'] = 128
config_json['vision_config']['out_hidden_size'] = config_json['text_config']['hidden_size']

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
torch.set_default_dtype(torch.bfloat16)
model = Glm4vMoeForConditionalGeneration(config)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()  # CPU is more stable for random initialization across machines
num_params = sum(p.numel() for p in model.parameters())
# Re-randomize every parameter with a fixed seed, logging each tensor's share of the total.
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.1)
        print(name, p.shape, p.dtype, p.device, f'{p.numel() / num_params * 100: .2f}%')
    for _, m in sorted(model.named_modules()):
        if isinstance(m, Glm4vMoeTextTopkRouter):
            assert 'e_score_correction_bias' in m.state_dict()
            torch.nn.init.normal_(m.e_score_correction_bias, 0, 1)
model.save_pretrained(save_folder)
print(model)
```
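
As a sanity check on the export (my own sketch, not part of the original script), the saved checkpoint can be reloaded and its parameter count compared against the ~11.8 MB bfloat16 `model.safetensors` (11,758,344 bytes ≈ 5.88M bf16 values):

```python
# Hypothetical verification step; `save_folder` as defined in the script above.
import torch
from transformers import Glm4vMoeForConditionalGeneration

save_folder = "/tmp/tiny-random/glm-4.5v"
reloaded = Glm4vMoeForConditionalGeneration.from_pretrained(save_folder, torch_dtype=torch.bfloat16)
print(sum(p.numel() for p in reloaded.parameters()))  # roughly 5.9M parameters
```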

### Printing the model:

```text
Glm4vMoeForConditionalGeneration(
  (model): Glm4vMoeModel(
    (visual): Glm4vMoeVisionModel(
      (embeddings): Glm4vMoeVisionEmbeddings(
        (position_embedding): Embedding(576, 64)
      )
      (patch_embed): Glm4vMoeVisionPatchEmbed(
        (proj): Conv3d(3, 64, kernel_size=(2, 14, 14), stride=(2, 14, 14))
      )
      (rotary_pos_emb): Glm4vMoeVisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-1): 2 x Glm4vMoeVisionBlock(
          (norm1): Glm4vMoeRMSNorm((64,), eps=1e-05)
          (norm2): Glm4vMoeRMSNorm((64,), eps=1e-05)
          (attn): Glm4vMoeVisionAttention(
            (qkv): Linear(in_features=64, out_features=192, bias=False)
            (proj): Linear(in_features=64, out_features=64, bias=False)
          )
          (mlp): Glm4vMoeisionMlp(
            (gate_proj): Linear(in_features=64, out_features=32, bias=False)
            (up_proj): Linear(in_features=64, out_features=32, bias=False)
            (down_proj): Linear(in_features=32, out_features=64, bias=False)
            (act_fn): SiLU()
          )
        )
      )
      (merger): Glm4vMoeVisionPatchMerger(
        (proj): Linear(in_features=32, out_features=32, bias=False)
        (post_projection_norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (gate_proj): Linear(in_features=32, out_features=128, bias=False)
        (up_proj): Linear(in_features=32, out_features=128, bias=False)
        (down_proj): Linear(in_features=128, out_features=32, bias=False)
        (act1): GELU(approximate='none')
        (act_fn): SiLU()
      )
      (post_conv_layernorm): Glm4vMoeRMSNorm((64,), eps=1e-05)
      (downsample): Conv2d(64, 32, kernel_size=(2, 2), stride=(2, 2))
      (post_layernorm): Glm4vMoeRMSNorm((64,), eps=1e-05)
    )
    (language_model): Glm4vMoeTextModel(
      (embed_tokens): Embedding(151552, 32, padding_idx=151329)
      (layers): ModuleList(
        (0): Glm4vMoeTextDecoderLayer(
          (self_attn): Glm4vMoeTextAttention(
            (q_proj): Linear(in_features=32, out_features=64, bias=True)
            (k_proj): Linear(in_features=32, out_features=32, bias=True)
            (v_proj): Linear(in_features=32, out_features=32, bias=True)
            (o_proj): Linear(in_features=64, out_features=32, bias=False)
          )
          (mlp): Glm4vMoeTextMLP(
            (gate_proj): Linear(in_features=32, out_features=128, bias=False)
            (up_proj): Linear(in_features=32, out_features=128, bias=False)
            (down_proj): Linear(in_features=128, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Glm4vMoeTextRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Glm4vMoeTextRMSNorm((32,), eps=1e-05)
        )
        (1): Glm4vMoeTextDecoderLayer(
          (self_attn): Glm4vMoeTextAttention(
            (q_proj): Linear(in_features=32, out_features=64, bias=True)
            (k_proj): Linear(in_features=32, out_features=32, bias=True)
            (v_proj): Linear(in_features=32, out_features=32, bias=True)
            (o_proj): Linear(in_features=64, out_features=32, bias=False)
          )
          (mlp): Glm4vMoeTextMoE(
            (experts): ModuleList(
              (0-127): 128 x Glm4vMoeTextMLP(
                (gate_proj): Linear(in_features=32, out_features=64, bias=False)
                (up_proj): Linear(in_features=32, out_features=64, bias=False)
                (down_proj): Linear(in_features=64, out_features=32, bias=False)
                (act_fn): SiLU()
              )
            )
            (gate): Glm4vMoeTextTopkRouter()
            (shared_experts): Glm4vMoeTextMLP(
              (gate_proj): Linear(in_features=32, out_features=64, bias=False)
              (up_proj): Linear(in_features=32, out_features=64, bias=False)
              (down_proj): Linear(in_features=64, out_features=32, bias=False)
              (act_fn): SiLU()
            )
          )
          (input_layernorm): Glm4vMoeTextRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Glm4vMoeTextRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Glm4vMoeRMSNorm((32,), eps=1e-05)
      (rotary_emb): Glm4vMoeTextRotaryEmbedding()
    )
  )
  (lm_head): Linear(in_features=32, out_features=151552, bias=False)
)
```
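
Almost all of the parameter budget above sits in the tied vocabulary embedding rather than the transformer blocks; a back-of-the-envelope check (my arithmetic, not from the original card):

```python
vocab_size, hidden_size = 151552, 32
# Tied embed_tokens / lm_head weights dominate the ~5.9M parameter total.
print(vocab_size * hidden_size)  # 4,849,664
```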
chat_template.jinja ADDED
@@ -0,0 +1,118 @@
[gMASK]<sop>
{%- if tools -%}
<|system|>
# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{% for tool in tools %}
{{ tool | tojson(ensure_ascii=False) }}
{% endfor %}
</tools>

For each function call, output the function name and arguments within the following XML format:
<tool_call>{function-name}
<arg_key>{arg-key-1}</arg_key>
<arg_value>{arg-value-1}</arg_value>
<arg_key>{arg-key-2}</arg_key>
<arg_value>{arg-value-2}</arg_value>
...
</tool_call>{%- endif -%}
{%- macro visible_text(content) -%}
{%- if content is string -%}
{{- content }}
{%- elif content is iterable and content is not mapping -%}
{%- for item in content -%}
{%- if item is mapping and item.type == 'text' -%}
{{- item.text }}
{%- elif item is mapping and (item.type == 'image' or 'image' in item) -%}
<|begin_of_image|><|image|><|end_of_image|>
{%- elif item is mapping and (item.type == 'video' or 'video' in item) -%}
<|begin_of_video|><|video|><|end_of_video|>
{%- elif item is string -%}
{{- item }}
{%- endif -%}
{%- endfor -%}
{%- else -%}
{{- content }}
{%- endif -%}
{%- endmacro -%}
{%- set ns = namespace(last_user_index=-1) %}
{%- for m in messages %}
{%- if m.role == 'user' %}
{% set ns.last_user_index = loop.index0 -%}
{%- endif %}
{%- endfor %}
{% for m in messages %}
{%- if m.role == 'user' -%}<|user|>
{% if m.content is string %}
{{ m.content }}
{%- else %}
{%- for item in m.content %}
{% if item.type == 'video' or 'video' in item %}
<|begin_of_video|><|video|><|end_of_video|>{% elif item.type == 'image' or 'image' in item %}
<|begin_of_image|><|image|><|end_of_image|>{% elif item.type == 'text' %}
{{ item.text }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}
{%- elif m.role == 'assistant' -%}
<|assistant|>
{%- set reasoning_content = '' %}
{%- set content = visible_text(m.content) %}
{%- if m.reasoning_content is string %}
{%- set reasoning_content = m.reasoning_content %}
{%- else %}
{%- if '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_user_index and reasoning_content -%}
{{ '\n<think>' + reasoning_content.strip() + '</think>'}}
{%- else -%}
{{ '\n<think></think>' }}
{%- endif -%}
{%- if content.strip() -%}
{{ '\n' + content.strip() }}
{%- endif -%}
{% if m.tool_calls %}
{% for tc in m.tool_calls %}
{%- if tc.function %}
{%- set tc = tc.function %}
{%- endif %}
{{ '\n<tool_call>' + tc.name }}
{% set _args = tc.arguments %}
{% for k, v in _args.items() %}
<arg_key>{{ k }}</arg_key>
<arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>
{% endfor %}
</tool_call>{% endfor %}
{% endif %}
{%- elif m.role == 'tool' -%}
{%- if m.content is string -%}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|observation|>' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- m.content }}
{{- '\n</tool_response>' }}
{%- else -%}
<|observation|>{% for tr in m.content %}

<tool_response>
{{ tr.output if tr.output is defined else tr }}
</tool_response>{% endfor -%}
{% endif -%}
{%- elif m.role == 'system' -%}
<|system|>
{{ visible_text(m.content) }}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
<|assistant|>
{{'<think></think>\n' if (enable_thinking is defined and not enable_thinking) else ''}}
{%- endif -%}
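
This template is what `processor.apply_chat_template` renders in the README's usage example. A minimal sketch of rendering it to a string (the output in the comment is my expectation from reading the template, not captured output):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("tiny-random/glm-4.5v")
text = processor.apply_chat_template(
    [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
    tokenize=False,
    add_generation_prompt=True,
)
print(text)  # expected roughly: [gMASK]<sop><|user|>\nhi<|assistant|>
```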
config.json ADDED
@@ -0,0 +1,80 @@
{
  "architectures": [
    "Glm4vMoeForConditionalGeneration"
  ],
  "image_end_token_id": 151340,
  "image_start_token_id": 151339,
  "image_token_id": 151363,
  "model_type": "glm4v_moe",
  "text_config": {
    "attention_bias": true,
    "attention_dropout": 0.0,
    "eos_token_id": [
      151329,
      151336,
      151338
    ],
    "first_k_dense_replace": 1,
    "head_dim": 32,
    "hidden_act": "silu",
    "hidden_size": 32,
    "image_end_token_id": 151340,
    "image_start_token_id": 151339,
    "image_token_id": 151363,
    "initializer_range": 0.02,
    "intermediate_size": 128,
    "max_position_embeddings": 65536,
    "model_type": "Glm4vMoe_text",
    "moe_intermediate_size": 64,
    "n_group": 1,
    "n_routed_experts": 128,
    "n_shared_experts": 1,
    "norm_topk_prob": true,
    "num_attention_heads": 2,
    "num_experts_per_tok": 8,
    "num_hidden_layers": 2,
    "num_key_value_heads": 1,
    "pad_token_id": 151329,
    "partial_rotary_factor": 0.5,
    "rms_norm_eps": 1e-05,
    "rope_scaling": {
      "mrope_section": [
        2,
        2,
        4
      ],
      "rope_type": "default"
    },
    "rope_theta": 10000.0,
    "routed_scaling_factor": 1.0,
    "tie_word_embeddings": true,
    "topk_group": 1,
    "torch_dtype": "bfloat16",
    "use_cache": true,
    "use_qk_norm": false,
    "vocab_size": 151552
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.56.0.dev0",
  "video_end_token_id": 151342,
  "video_start_token_id": 151341,
  "video_token_id": 151364,
  "vision_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "depth": 2,
    "hidden_act": "silu",
    "hidden_size": 64,
    "image_size": 336,
    "in_channels": 3,
    "initializer_range": 0.02,
    "intermediate_size": 128,
    "model_type": "glm4v_moe",
    "num_heads": 2,
    "out_hidden_size": 32,
    "patch_size": 14,
    "rms_norm_eps": 1e-05,
    "spatial_merge_size": 2,
    "temporal_patch_size": 2
  }
}
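
A quick way to confirm the tiny geometry above (a sketch; attribute access follows the nested `text_config`/`vision_config` layout of this file):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("tiny-random/glm-4.5v")
assert cfg.text_config.hidden_size == 32
assert cfg.text_config.num_hidden_layers == 2  # layer 0 dense, layer 1 MoE (first_k_dense_replace=1)
assert cfg.text_config.n_routed_experts == 128
assert cfg.vision_config.out_hidden_size == cfg.text_config.hidden_size
```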
generation_config.json ADDED
@@ -0,0 +1,13 @@
{
  "_from_model_config": true,
  "do_sample": true,
  "eos_token_id": [
    151329,
    151336,
    151338
  ],
  "pad_token_id": 151329,
  "top_k": 1,
  "top_p": 0.0001,
  "transformers_version": "4.56.0.dev0"
}
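
Note that `do_sample: true` combined with `top_k: 1` and a near-zero `top_p` makes decoding effectively greedy, so the debug model generates deterministically. A small check (sketch):

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("tiny-random/glm-4.5v")
# top_k=1 keeps only the argmax token in the sampling pool: greedy in practice.
print(gen_cfg.do_sample, gen_cfg.top_k, gen_cfg.top_p)  # True 1 0.0001
```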
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6d07100340fa8163b239f0015b72c09c3849037fff302d4eab84e8cd3cdec0a3
size 11758344
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
{
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Glm4vImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "merge_size": 2,
  "patch_size": 14,
  "processor_class": "Glm4vProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 9633792,
    "shortest_edge": 12544
  },
  "temporal_patch_size": 2
}
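
The `size` bounds here are total-pixel budgets rather than edge lengths: images are resized so their area lies between `shortest_edge` (12544 = 112×112) and `longest_edge`. A hedged sketch of running the image processor (the `image_grid_thw` output key is an assumption based on this Qwen2-VL-style processor family):

```python
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("tiny-random/glm-4.5v")
image = Image.new("RGB", (224, 224))  # 50176 px, within [12544, 9633792]
out = processor.image_processor(images=image, return_tensors="pt")
print(out["pixel_values"].shape)    # flattened 14x14 patches
print(out.get("image_grid_thw"))    # assumed (t, h, w) patch-grid output
```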
special_tokens_map.json ADDED
@@ -0,0 +1,40 @@
{
  "additional_special_tokens": [
    "<|endoftext|>",
    "[MASK]",
    "[gMASK]",
    "[sMASK]",
    "<sop>",
    "<eop>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|observation|>",
    "<|begin_of_image|>",
    "<|end_of_image|>",
    "<|begin_of_video|>",
    "<|end_of_video|>",
    "<|begin_of_audio|>",
    "<|end_of_audio|>",
    "<|begin_of_transcription|>",
    "<|end_of_transcription|>",
    "<|code_prefix|>",
    "<|code_middle|>",
    "<|code_suffix|>",
    "/nothink"
  ],
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bda8e2146c3bb7b7e0fc96dcc4f0aeff041c6c27952e3ace0665663ebff346ba
size 19970700
tokenizer_config.json ADDED
@@ -0,0 +1,326 @@
{
  "added_tokens_decoder": {
    "151329": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151330": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151331": {
      "content": "[gMASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151332": {
      "content": "[sMASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151333": {
      "content": "<sop>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151334": {
      "content": "<eop>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151335": {
      "content": "<|system|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151336": {
      "content": "<|user|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151337": {
      "content": "<|assistant|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151338": {
      "content": "<|observation|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151339": {
      "content": "<|begin_of_image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151340": {
      "content": "<|end_of_image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151341": {
      "content": "<|begin_of_video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151342": {
      "content": "<|end_of_video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151343": {
      "content": "<|begin_of_audio|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151344": {
      "content": "<|end_of_audio|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151345": {
      "content": "<|begin_of_transcription|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151346": {
      "content": "<|end_of_transcription|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151347": {
      "content": "<|code_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151348": {
      "content": "<|code_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151349": {
      "content": "<|code_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151350": {
      "content": "<think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151351": {
      "content": "</think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151352": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151353": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151354": {
      "content": "<tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151355": {
      "content": "</tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151356": {
      "content": "<arg_key>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151357": {
      "content": "</arg_key>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151358": {
      "content": "<arg_value>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151359": {
      "content": "</arg_value>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151360": {
      "content": "/nothink",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151361": {
      "content": "<|begin_of_box|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151362": {
      "content": "<|end_of_box|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151363": {
      "content": "<|image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151364": {
      "content": "<|video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|endoftext|>",
    "[MASK]",
    "[gMASK]",
    "[sMASK]",
    "<sop>",
    "<eop>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|observation|>",
    "<|begin_of_image|>",
    "<|end_of_image|>",
    "<|begin_of_video|>",
    "<|end_of_video|>",
    "<|begin_of_audio|>",
    "<|end_of_audio|>",
    "<|begin_of_transcription|>",
    "<|end_of_transcription|>",
    "<|code_prefix|>",
    "<|code_middle|>",
    "<|code_suffix|>",
    "/nothink"
  ],
  "clean_up_tokenization_spaces": false,
  "do_lower_case": false,
  "eos_token": "<|endoftext|>",
  "extra_special_tokens": {},
  "model_max_length": 128000,
  "pad_token": "<|endoftext|>",
  "padding_side": "left",
  "processor_class": "Glm4vProcessor",
  "remove_space": false,
  "tokenizer_class": "PreTrainedTokenizerFast"
}
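
The added-token IDs line up with the multimodal token IDs in `config.json`; for instance `<|image|>` is 151363 in both. A quick check (sketch):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiny-random/glm-4.5v")
print(tok.convert_tokens_to_ids("<|image|>"))   # 151363, matches image_token_id in config.json
print(tok.convert_tokens_to_ids("<|video|>"))   # 151364, matches video_token_id
```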
video_preprocessor_config.json ADDED
@@ -0,0 +1,42 @@
{
  "crop_size": null,
  "data_format": "channels_first",
  "default_to_square": true,
  "device": null,
  "do_center_crop": null,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_pad": null,
  "do_rescale": true,
  "do_resize": true,
  "do_sample_frames": true,
  "fps": 2,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "input_data_format": null,
  "max_image_size": {
    "longest_edge": 47040000
  },
  "merge_size": 2,
  "num_frames": 16,
  "patch_size": 14,
  "processor_class": "Glm4vProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 47040000,
    "shortest_edge": 12544
  },
  "size_divisor": null,
  "temporal_patch_size": 2,
  "video_metadata": null,
  "video_processor_type": "Glm4vVideoProcessor"
}
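
Videos are sampled at `fps: 2` up to `num_frames: 16`, and frames are consumed in temporal pairs (`temporal_patch_size: 2`). A sketch of inspecting these values (accessing the video processor via `processor.video_processor` is an assumption about the processor's attribute layout):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("tiny-random/glm-4.5v")
vp = processor.video_processor  # assumed attribute name
print(vp.fps, vp.num_frames, vp.temporal_patch_size)  # 2 16 2
```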