haijunlv committed on
Commit 22acd83 · verified · 1 Parent(s): 483826d

upload model
chat_template.jinja ADDED
@@ -0,0 +1,120 @@
+ {% set default_thinking_sys %}You are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.{% endset %}
+ {%- set tool_instruction %}Your response should consist of a reasoning step (**thought**) followed immediately by a function call in valid JSON format. Wrap each function call using the `<|action_start|><|plugin|>` and `<|action_end|>` tags.
+
+ **Format example:**
+
+ ```
+ (Your thought goes here...)
+
+ <|action_start|><|plugin|>
+ {
+ "name": "tool_name",
+ "parameters": {
+ "parameter1": "value1",
+ "parameter2": "value2"
+ }
+ }
+ <|action_end|>
+ ```
+
+ # External Tools
+ You have access to these tools:
+ {% if tools %}{{ tools | tojson(indent=2) }}{% else %}[]{% endif %}{% endset %}
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+ {%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+ {%- set role = message.role if message.role != 'tool' else 'environment' %}
+ {%- set reasoning_content = '' %}
+ {%- set content = message.content %}
+ {%- set ns.tool_calls = '' %}
+ {%- if role == 'assistant' %}
+ {%- if message.reasoning_content is string %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- elif '</think>' in content %}
+ {%- set reasoning_content = content.split('</think>')[0].strip().split('<think>')[-1].strip() %}
+ {%- set content = content.split('</think>')[-1].lstrip('
+ ') %}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if not loop.first %}
+ {%- set ns.tool_calls = ns.tool_calls + '
+ ' %}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {%- set ns.tool_calls = ns.tool_calls + '<|action_start|><|plugin|>
+ {"name": "' + tool_call.name + '", "parameters": ' %}
+ {%- if tool_call.arguments is string %}
+ {%- set ns.tool_calls = ns.tool_calls + tool_call.arguments %}
+ {%- else %}
+ {%- set ns.tool_calls = ns.tool_calls + tool_call.arguments | tojson %}
+ {%- endif %}
+ {%- set ns.tool_calls = ns.tool_calls + '}
+ <|action_end|>' %}
+ {%- endfor %}
+ {%- endif %}
+ {%- set reasoning_content = '<think>
+ ' + reasoning_content.strip('
+ ') + '
+ </think>
+ ' %}
+ {%- endif %}
+ {%- if not content is string %}
+ {%- set ns.content = '' %}
+ {%- for _content in message.content %}
+ {%- if _content.type == 'image' %}
+ {%- set ns.content = ns.content ~ '
+ <IMG_CONTEXT>' %}
+ {%- elif _content.type == 'video' %}
+ {%- set ns.content = ns.content ~ '
+ <video>' %}
+ {%- elif _content.type == 'text' %}
+ {%- set ns.content = ns.content ~ '
+ ' ~ _content.text %}
+ {%- endif %}
+ {%- endfor %}
+ {%- set content = ns.content %}
+ {%- endif %}
+ {%- set content = content.lstrip('
+ ') %}
+ {%- if ns.tool_calls %}
+ {%- set content = content + ns.tool_calls %}
+ {%- endif %}
+ {%- if loop.index0 == 0 %}
+ {%- set system_prompt = '' %}
+ {%- if role == 'system' %}
+ {%- set system_prompt = system_prompt + content %}
+ {%- elif enable_thinking is not defined or enable_thinking %}
+ {%- set system_prompt = system_prompt + default_thinking_sys %}
+ {%- endif %}
+ {%- if tools %}
+ {%- set system_prompt = system_prompt.rstrip('
+ ') + '
+
+ ' + tool_instruction %}
+ {%- endif %}
+ {%- set system_prompt = system_prompt.strip('
+ ') %}
+ {%- endif %}
+ {%- if loop.index0 == 0 and system_prompt %}<|im_start|>system{% if tools %} name=<|plugin|>{% endif %}
+
+ {{ system_prompt }}<|im_end|>
+ {% endif %}
+ {%- if role != 'system' %}<|im_start|>{{ role }}{% if role == 'environment' or role == 'tool' %} name=<|plugin|>{% endif %}
+
+ {% if loop.index0 > ns.last_query_index and (loop.last or (not loop.last and reasoning_content)) %}{{ reasoning_content }}
+ {%- endif %}{{ content }}<|im_end|>
+ {% endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}<|im_start|>assistant
+ {% if enable_thinking is not defined or enable_thinking %}
+ <think>{% endif %}
+ {% endif %}
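For reference, the template above is normally rendered through `apply_chat_template`; the sketch below is a minimal, unofficial example that assumes a placeholder repo id and that extra keyword arguments such as `enable_thinking` are forwarded to the template, as in recent `transformers` releases.

```python
# Hedged usage sketch for chat_template.jinja; the repo id is a placeholder.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-org/your-interns1-repo", trust_remote_code=True)

messages = [{"role": "user", "content": "What is 2 + 2?"}]
prompt = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # appends <|im_start|>assistant (plus <think> when thinking is enabled)
    enable_thinking=True,        # toggles the default thinking system prompt defined at the top of the template
)
print(prompt)
```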
config.json ADDED
@@ -0,0 +1,89 @@
+ {
+   "architectures": [
+     "InternS1ForConditionalGeneration"
+   ],
+   "downsample_ratio": 0.5,
+   "image_seq_length": 256,
+   "image_token_id": 152957,
+   "model_type": "interns1",
+   "projector_hidden_act": "gelu",
+   "text_config": {
+     "_attn_implementation_autoset": true,
+     "architectures": [
+       "Qwen3ForCausalLM"
+     ],
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "bos_token_id": 151643,
+     "eos_token_id": 151645,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "hidden_size": 4096,
+     "initializer_range": 0.02,
+     "intermediate_size": 12288,
+     "max_position_embeddings": 40960,
+     "max_window_layers": 36,
+     "model_type": "qwen3",
+     "num_attention_heads": 32,
+     "num_hidden_layers": 36,
+     "num_key_value_heads": 8,
+     "rms_norm_eps": 1e-06,
+     "rope_scaling": null,
+     "rope_theta": 1000000,
+     "sliding_window": null,
+     "torch_dtype": "bfloat16",
+     "use_cache": true,
+     "use_sliding_window": false,
+     "vocab_size": 153216
+   },
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.53.0",
+   "vision_config": {
+     "_attn_implementation_autoset": true,
+     "architectures": [
+       "InternVisionModel"
+     ],
+     "attention_bias": true,
+     "attention_dropout": 0.0,
+     "auto_map": {
+       "AutoConfig": "configuration_interns1.InternS1VisionConfig",
+       "AutoModel": "modeling_interns1.InternS1VisionModel"
+     },
+     "drop_path_rate": 0.0,
+     "dropout": 0.0,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.0,
+     "hidden_size": 1024,
+     "image_size": [
+       448,
+       448
+     ],
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "layer_norm_eps": 1e-06,
+     "layer_scale_init_value": 0.1,
+     "model_type": "interns1_vision",
+     "norm_type": "layer_norm",
+     "num_attention_heads": 16,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "patch_size": [
+       14,
+       14
+     ],
+     "projection_dropout": 0.0,
+     "torch_dtype": "bfloat16",
+     "use_absolute_position_embeddings": true,
+     "use_mask_token": false,
+     "use_mean_pooling": true,
+     "use_qk_norm": false
+   },
+   "vision_feature_layer": -1,
+   "vision_feature_select_strategy": "default",
+   "auto_map": {
+     "AutoConfig": "configuration_interns1.InternS1Config",
+     "AutoModel": "modeling_interns1.InternS1Model",
+     "AutoModelForCausalLM": "modeling_interns1.InternS1ForConditionalGeneration"
+   }
+ }
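Given the `auto_map` entries in this config, the checkpoint is meant to be loaded through the bundled remote code; below is a minimal loading sketch under that assumption (placeholder repo id, dtype matching the config).

```python
# Hedged loading sketch; trust_remote_code pulls configuration_interns1.py /
# modeling_interns1.py from the repository. The repo id is a placeholder.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "your-org/your-interns1-repo",  # placeholder
    torch_dtype=torch.bfloat16,     # matches "torch_dtype": "bfloat16" above
    trust_remote_code=True,
)
```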
configuration_interns1.py ADDED
@@ -0,0 +1,225 @@
+ # coding=utf-8
+ # Copyright 2025 HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers import AutoConfig
+
+
+ class InternS1VisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`InternS1VisionModel`]. It is used to instantiate an
+     InternS1VisionModel model according to the specified arguments, defining the model architecture.
+
+     Args:
+         hidden_size (`int`, *optional*, defaults to 1024):
+             Dimensionality of the encoder layers and the pooler layer.
+         num_hidden_layers (`int`, *optional*, defaults to 24):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to add a bias to the queries, keys and values.
+         use_qk_norm (`bool`, *optional*, defaults to `False`):
+             Whether to apply normalization to the queries and keys before the attention operation.
+         intermediate_size (`int`, *optional*, defaults to 4096):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+         hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             Dropout probability for attention weights.
+         projection_dropout (`float`, *optional*, defaults to 0.0):
+             Dropout probability for the projection layer.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+             The type of normalization to use in the encoder. Can be `"layer_norm"` or `"rms_norm"`.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the layer normalization layers.
+         image_size (`int` or `list[int]`, *optional*, defaults to `[448, 448]`):
+             The size (resolution) of each image.
+         patch_size (`int` or `list[int]`, *optional*, defaults to `[14, 14]`):
+             The size (resolution) of each patch.
+         num_channels (`int`, *optional*, defaults to 3):
+             The number of input channels.
+         use_mask_token (`bool`, *optional*, defaults to `False`):
+             Whether to use a mask token for masked image modeling.
+         use_absolute_position_embeddings (`bool`, *optional*, defaults to `True`):
+             Whether to use BERT-style absolute position embeddings.
+         layer_scale_init_value (`float`, *optional*, defaults to 0.1):
+             Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
+         use_mean_pooling (`bool`, *optional*, defaults to `True`):
+             Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
+             CLS token, before applying the classification head.
+
+     Example:
+
+     ```python
+     >>> from transformers import InternS1VisionConfig, InternS1VisionModel
+
+     >>> # Initializing an InternS1VisionModel
+     >>> configuration = InternS1VisionConfig()
+
+     >>> # Initializing a model (with random weights) from configuration
+     >>> model = InternS1VisionModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "interns1_vision"
+     base_config_key = "vision_config"
+
+     def __init__(
+         self,
+         hidden_size=1024,
+         num_hidden_layers=24,
+         num_attention_heads=16,
+         attention_bias=False,
+         use_qk_norm=False,
+         intermediate_size=4096,
+         hidden_act="gelu",
+         hidden_dropout_prob=0.0,
+         attention_dropout=0.0,
+         projection_dropout=0.0,
+         drop_path_rate=0.0,
+         initializer_range=0.02,
+         norm_type="layer_norm",
+         layer_norm_eps=1e-06,
+         image_size=[448, 448],
+         patch_size=[14, 14],
+         num_channels=3,
+         use_mask_token=False,
+         use_absolute_position_embeddings=True,
+         layer_scale_init_value=0.1,
+         use_mean_pooling=True,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.attention_bias = attention_bias
+         self.use_qk_norm = use_qk_norm
+         self.intermediate_size = intermediate_size
+         self.hidden_act = hidden_act
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.attention_dropout = attention_dropout
+         self.projection_dropout = projection_dropout
+         self.initializer_range = initializer_range
+         self.norm_type = norm_type
+         self.layer_norm_eps = layer_norm_eps
+         self.drop_path_rate = drop_path_rate
+
+         image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size)
+         patch_size = patch_size if isinstance(patch_size, (list, tuple)) else (patch_size, patch_size)
+         self.image_size = image_size
+         self.patch_size = patch_size
+
+         self.num_channels = num_channels
+         self.use_mask_token = use_mask_token
+         self.use_absolute_position_embeddings = use_absolute_position_embeddings
+         self.layer_scale_init_value = layer_scale_init_value
+         self.use_mean_pooling = use_mean_pooling
+
+
+ class InternS1Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`InternS1ForConditionalGeneration`]. It is used to instantiate an
+     InternS1 model according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `InternS1VisionConfig`):
+             The config object or dictionary of the vision backbone.
+         text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen3Config`):
+             The config object or dictionary of the text backbone.
+         image_token_id (`int`, *optional*, defaults to 151667):
+             The image token index to encode the image prompt.
+         image_seq_length (`int`, *optional*, defaults to 256):
+             Number of image tokens to use per image patch.
+         downsample_ratio (`float`, *optional*, defaults to 0.5):
+             Factor by which to downsample the image.
+         projector_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the projector.
+         vision_feature_layer (`int`, *optional*, defaults to -1):
+             The index of the layer to use as the image features.
+         vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+             The feature selection strategy used to select the vision feature from the vision backbone.
+             Can be one of `"default"` or `"full"`.
+
+     ```python
+     >>> from transformers import InternS1ForConditionalGeneration, InternS1Config
+
+     >>> # Initializing an InternS1 style configuration
+     >>> configuration = InternS1Config()
+
+     >>> # Initializing a model (with random weights) from configuration
+     >>> model = InternS1ForConditionalGeneration(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "interns1"
+     sub_configs = {"text_config": AutoConfig, "vision_config": InternS1VisionConfig}
+
+     def __init__(
+         self,
+         vision_config=None,
+         text_config=None,
+         image_token_id=151667,
+         image_seq_length=256,
+         downsample_ratio=0.5,
+         projector_hidden_act="gelu",
+         vision_feature_layer=-1,
+         vision_feature_select_strategy="default",
+         **kwargs,
+     ):
+         from transformers import CONFIG_MAPPING
+
+         self.image_token_id = image_token_id
+         self.image_seq_length = image_seq_length
+         self.downsample_ratio = downsample_ratio
+         self.projector_hidden_act = projector_hidden_act
+         self.vision_feature_layer = vision_feature_layer
+         self.vision_feature_select_strategy = vision_feature_select_strategy
+
+         if isinstance(vision_config, dict):
+             self.vision_config = InternS1VisionConfig(**vision_config)
+         elif isinstance(vision_config, InternS1VisionConfig):
+             self.vision_config = vision_config
+         elif vision_config is None:
+             self.vision_config = InternS1VisionConfig()
+
+         if isinstance(text_config, dict):
+             text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen3"
+             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+         elif text_config is None:
+             text_config = CONFIG_MAPPING["qwen3"]()
+
+         self.text_config = text_config
+
+         super().__init__(**kwargs)
+
+
+ __all__ = ["InternS1VisionConfig", "InternS1Config"]
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "transformers_version": "4.53.0"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae0c9b37f256e0f3d636d89c0f3b1f15d5accd900935aa246b1edb26bf114c8b
+ size 4916843808
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:493e945a856cdd4d8ee40d90aff4144ab417d9d75484d8414ddd779c9f5351c6
+ size 4915962480
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e23506f9d41df413781d363027506c2342a83386b84efbc9ef577f8109a8ebf3
+ size 4915962496
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19162a9cf951ab491f9294f858a3a070eff107e1d3f6354464ad37914088f889
+ size 2328949432
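The four `.safetensors` entries above are Git LFS pointer files (`version` / `oid` / `size`), not the weights themselves; below is a hedged sketch of fetching the resolved shards with `huggingface_hub` (placeholder repo id).

```python
# Hedged download sketch: resolve the LFS pointers into actual shard files.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    "your-org/your-interns1-repo",               # placeholder repo id
    allow_patterns=["*.safetensors", "*.json"],  # weight shards plus configs and the shard index
)
print(local_dir)
```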
model.safetensors.index.json ADDED
@@ -0,0 +1,849 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 8538804224,
4
+ "total_size": 17077608448
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.language_model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
15
+ "model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
18
+ "model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
26
+ "model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
29
+ "model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.language_model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
32
+ "model.language_model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.language_model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.language_model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.language_model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
36
+ "model.language_model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
37
+ "model.language_model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.language_model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.language_model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
40
+ "model.language_model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.language_model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.language_model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
43
+ "model.language_model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.language_model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.language_model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
47
+ "model.language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
48
+ "model.language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.language_model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
51
+ "model.language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.language_model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
54
+ "model.language_model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.language_model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.language_model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.language_model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.language_model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
59
+ "model.language_model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.language_model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.language_model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
62
+ "model.language_model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.language_model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.language_model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
65
+ "model.language_model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.language_model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.language_model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.language_model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.language_model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
70
+ "model.language_model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.language_model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.language_model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
73
+ "model.language_model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.language_model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.language_model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
76
+ "model.language_model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.language_model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.language_model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.language_model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
80
+ "model.language_model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
81
+ "model.language_model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.language_model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.language_model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
84
+ "model.language_model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.language_model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.language_model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.language_model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.language_model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.language_model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
91
+ "model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
92
+ "model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.language_model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
95
+ "model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.language_model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.language_model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
102
+ "model.language_model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
103
+ "model.language_model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.language_model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.language_model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
106
+ "model.language_model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.language_model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
113
+ "model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
114
+ "model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
117
+ "model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
120
+ "model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
122
+ "model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
124
+ "model.language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
125
+ "model.language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
128
+ "model.language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
130
+ "model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
131
+ "model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
135
+ "model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
136
+ "model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
139
+ "model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
142
+ "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
143
+ "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
144
+ "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
145
+ "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
146
+ "model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
147
+ "model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
148
+ "model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
149
+ "model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
150
+ "model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
151
+ "model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
152
+ "model.language_model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
153
+ "model.language_model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
155
+ "model.language_model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.language_model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
157
+ "model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
158
+ "model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
160
+ "model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
161
+ "model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
162
+ "model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
163
+ "model.language_model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
164
+ "model.language_model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.language_model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.language_model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.language_model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
168
+ "model.language_model.layers.21.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
169
+ "model.language_model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.language_model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.language_model.layers.21.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
172
+ "model.language_model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.language_model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.language_model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
175
+ "model.language_model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.language_model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.language_model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.language_model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
179
+ "model.language_model.layers.22.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
180
+ "model.language_model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.language_model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
182
+ "model.language_model.layers.22.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
183
+ "model.language_model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.language_model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.language_model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
186
+ "model.language_model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.language_model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.language_model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
191
+ "model.language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.language_model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
194
+ "model.language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
197
+ "model.language_model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.language_model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.language_model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.language_model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.language_model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
202
+ "model.language_model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.language_model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.language_model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
205
+ "model.language_model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.language_model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.language_model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
208
+ "model.language_model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.language_model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.language_model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.language_model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
212
+ "model.language_model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
213
+ "model.language_model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.language_model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.language_model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
216
+ "model.language_model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.language_model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
218
+ "model.language_model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
219
+ "model.language_model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.language_model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.language_model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.language_model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
223
+ "model.language_model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
224
+ "model.language_model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.language_model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.language_model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
227
+ "model.language_model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.language_model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.language_model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.language_model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.language_model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.language_model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
234
+ "model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
235
+ "model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
238
+ "model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.language_model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.language_model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "model.language_model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.language_model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.language_model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.language_model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
245
+ "model.language_model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
246
+ "model.language_model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.language_model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.language_model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
249
+ "model.language_model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.language_model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.language_model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
252
+ "model.language_model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.language_model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
254
+ "model.language_model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.language_model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
256
+ "model.language_model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
257
+ "model.language_model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.language_model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.language_model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
260
+ "model.language_model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.language_model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
262
+ "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
263
+ "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
266
+ "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
267
+ "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
268
+ "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
271
+ "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.language_model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
274
+ "model.language_model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.language_model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.language_model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
277
+ "model.language_model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
278
+ "model.language_model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
279
+ "model.language_model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.language_model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.language_model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
282
+ "model.language_model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.language_model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
284
+ "model.language_model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
285
+ "model.language_model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.language_model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.language_model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
288
+ "model.language_model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
289
+ "model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
290
+ "model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
293
+ "model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
295
+ "model.language_model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
296
+ "model.language_model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
297
+ "model.language_model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.language_model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
299
+ "model.language_model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
300
+ "model.language_model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
301
+ "model.language_model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.language_model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
303
+ "model.language_model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
304
+ "model.language_model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.language_model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
306
+ "model.language_model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
307
+ "model.language_model.layers.33.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
308
+ "model.language_model.layers.33.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
309
+ "model.language_model.layers.33.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
310
+ "model.language_model.layers.33.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
311
+ "model.language_model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
312
+ "model.language_model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
313
+ "model.language_model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
314
+ "model.language_model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
315
+ "model.language_model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.language_model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
317
+ "model.language_model.layers.34.input_layernorm.weight": "model-00004-of-00004.safetensors",
318
+ "model.language_model.layers.34.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
319
+ "model.language_model.layers.34.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
320
+ "model.language_model.layers.34.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
321
+ "model.language_model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
322
+ "model.language_model.layers.34.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
323
+ "model.language_model.layers.34.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
324
+ "model.language_model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
325
+ "model.language_model.layers.34.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
326
+ "model.language_model.layers.34.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
327
+ "model.language_model.layers.34.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
328
+ "model.language_model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
329
+ "model.language_model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
330
+ "model.language_model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
331
+ "model.language_model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
332
+ "model.language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
333
+ "model.language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
334
+ "model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
335
+ "model.language_model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
336
+ "model.language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
337
+ "model.language_model.layers.35.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
338
+ "model.language_model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
339
+ "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
340
+ "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
341
+ "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
342
+ "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
343
+ "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
344
+ "model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
345
+ "model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
346
+ "model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
347
+ "model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
348
+ "model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
349
+ "model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
350
+ "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
351
+ "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
353
+ "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
354
+ "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
355
+ "model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
356
+ "model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
357
+ "model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
358
+ "model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
359
+ "model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
360
+ "model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
361
+ "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
362
+ "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
363
+ "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
364
+ "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
365
+ "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
366
+ "model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
367
+ "model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
368
+ "model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
369
+ "model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
370
+ "model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
371
+ "model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
372
+ "model.language_model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
373
+ "model.language_model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
374
+ "model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
375
+ "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
376
+ "model.language_model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
377
+ "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
378
+ "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
379
+ "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
380
+ "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
381
+ "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
382
+ "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
383
+ "model.language_model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
384
+ "model.language_model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
385
+ "model.language_model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
386
+ "model.language_model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
387
+ "model.language_model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
388
+ "model.language_model.layers.8.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
389
+ "model.language_model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
390
+ "model.language_model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
391
+ "model.language_model.layers.8.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
392
+ "model.language_model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
393
+ "model.language_model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
394
+ "model.language_model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
395
+ "model.language_model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
396
+ "model.language_model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
397
+ "model.language_model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
398
+ "model.language_model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
399
+ "model.language_model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
400
+ "model.language_model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
401
+ "model.language_model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
402
+ "model.language_model.layers.9.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
403
+ "model.language_model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
404
+ "model.language_model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
405
+ "model.language_model.norm.weight": "model-00004-of-00004.safetensors",
406
+ "model.multi_modal_projector.layer_norm.bias": "model-00001-of-00004.safetensors",
407
+ "model.multi_modal_projector.layer_norm.weight": "model-00001-of-00004.safetensors",
408
+ "model.multi_modal_projector.linear_1.bias": "model-00001-of-00004.safetensors",
409
+ "model.multi_modal_projector.linear_1.weight": "model-00001-of-00004.safetensors",
410
+ "model.multi_modal_projector.linear_2.bias": "model-00001-of-00004.safetensors",
411
+ "model.multi_modal_projector.linear_2.weight": "model-00001-of-00004.safetensors",
412
+ "model.vision_tower.embeddings.cls_token": "model-00001-of-00004.safetensors",
413
+ "model.vision_tower.embeddings.patch_embeddings.projection.bias": "model-00001-of-00004.safetensors",
414
+ "model.vision_tower.embeddings.patch_embeddings.projection.weight": "model-00001-of-00004.safetensors",
415
+ "model.vision_tower.embeddings.position_embeddings": "model-00001-of-00004.safetensors",
416
+ "model.vision_tower.encoder.layer.0.attention.k_proj.bias": "model-00001-of-00004.safetensors",
417
+ "model.vision_tower.encoder.layer.0.attention.k_proj.weight": "model-00001-of-00004.safetensors",
418
+ "model.vision_tower.encoder.layer.0.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
419
+ "model.vision_tower.encoder.layer.0.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
420
+ "model.vision_tower.encoder.layer.0.attention.q_proj.bias": "model-00001-of-00004.safetensors",
421
+ "model.vision_tower.encoder.layer.0.attention.q_proj.weight": "model-00001-of-00004.safetensors",
422
+ "model.vision_tower.encoder.layer.0.attention.v_proj.bias": "model-00001-of-00004.safetensors",
423
+ "model.vision_tower.encoder.layer.0.attention.v_proj.weight": "model-00001-of-00004.safetensors",
424
+ "model.vision_tower.encoder.layer.0.lambda_1": "model-00001-of-00004.safetensors",
425
+ "model.vision_tower.encoder.layer.0.lambda_2": "model-00001-of-00004.safetensors",
426
+ "model.vision_tower.encoder.layer.0.layernorm_after.bias": "model-00001-of-00004.safetensors",
427
+ "model.vision_tower.encoder.layer.0.layernorm_after.weight": "model-00001-of-00004.safetensors",
428
+ "model.vision_tower.encoder.layer.0.layernorm_before.bias": "model-00001-of-00004.safetensors",
429
+ "model.vision_tower.encoder.layer.0.layernorm_before.weight": "model-00001-of-00004.safetensors",
430
+ "model.vision_tower.encoder.layer.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
431
+ "model.vision_tower.encoder.layer.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
432
+ "model.vision_tower.encoder.layer.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
433
+ "model.vision_tower.encoder.layer.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
434
+ "model.vision_tower.encoder.layer.1.attention.k_proj.bias": "model-00001-of-00004.safetensors",
435
+ "model.vision_tower.encoder.layer.1.attention.k_proj.weight": "model-00001-of-00004.safetensors",
436
+ "model.vision_tower.encoder.layer.1.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
437
+ "model.vision_tower.encoder.layer.1.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
438
+ "model.vision_tower.encoder.layer.1.attention.q_proj.bias": "model-00001-of-00004.safetensors",
439
+ "model.vision_tower.encoder.layer.1.attention.q_proj.weight": "model-00001-of-00004.safetensors",
440
+ "model.vision_tower.encoder.layer.1.attention.v_proj.bias": "model-00001-of-00004.safetensors",
441
+ "model.vision_tower.encoder.layer.1.attention.v_proj.weight": "model-00001-of-00004.safetensors",
442
+ "model.vision_tower.encoder.layer.1.lambda_1": "model-00001-of-00004.safetensors",
443
+ "model.vision_tower.encoder.layer.1.lambda_2": "model-00001-of-00004.safetensors",
444
+ "model.vision_tower.encoder.layer.1.layernorm_after.bias": "model-00001-of-00004.safetensors",
445
+ "model.vision_tower.encoder.layer.1.layernorm_after.weight": "model-00001-of-00004.safetensors",
446
+ "model.vision_tower.encoder.layer.1.layernorm_before.bias": "model-00001-of-00004.safetensors",
447
+ "model.vision_tower.encoder.layer.1.layernorm_before.weight": "model-00001-of-00004.safetensors",
448
+ "model.vision_tower.encoder.layer.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
449
+ "model.vision_tower.encoder.layer.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
450
+ "model.vision_tower.encoder.layer.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
451
+ "model.vision_tower.encoder.layer.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
452
+ "model.vision_tower.encoder.layer.10.attention.k_proj.bias": "model-00001-of-00004.safetensors",
453
+ "model.vision_tower.encoder.layer.10.attention.k_proj.weight": "model-00001-of-00004.safetensors",
454
+ "model.vision_tower.encoder.layer.10.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
455
+ "model.vision_tower.encoder.layer.10.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
456
+ "model.vision_tower.encoder.layer.10.attention.q_proj.bias": "model-00001-of-00004.safetensors",
457
+ "model.vision_tower.encoder.layer.10.attention.q_proj.weight": "model-00001-of-00004.safetensors",
458
+ "model.vision_tower.encoder.layer.10.attention.v_proj.bias": "model-00001-of-00004.safetensors",
459
+ "model.vision_tower.encoder.layer.10.attention.v_proj.weight": "model-00001-of-00004.safetensors",
460
+ "model.vision_tower.encoder.layer.10.lambda_1": "model-00001-of-00004.safetensors",
461
+ "model.vision_tower.encoder.layer.10.lambda_2": "model-00001-of-00004.safetensors",
462
+ "model.vision_tower.encoder.layer.10.layernorm_after.bias": "model-00001-of-00004.safetensors",
463
+ "model.vision_tower.encoder.layer.10.layernorm_after.weight": "model-00001-of-00004.safetensors",
464
+ "model.vision_tower.encoder.layer.10.layernorm_before.bias": "model-00001-of-00004.safetensors",
465
+ "model.vision_tower.encoder.layer.10.layernorm_before.weight": "model-00001-of-00004.safetensors",
466
+ "model.vision_tower.encoder.layer.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
467
+ "model.vision_tower.encoder.layer.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
468
+ "model.vision_tower.encoder.layer.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
469
+ "model.vision_tower.encoder.layer.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
470
+ "model.vision_tower.encoder.layer.11.attention.k_proj.bias": "model-00001-of-00004.safetensors",
471
+ "model.vision_tower.encoder.layer.11.attention.k_proj.weight": "model-00001-of-00004.safetensors",
472
+ "model.vision_tower.encoder.layer.11.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
473
+ "model.vision_tower.encoder.layer.11.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
474
+ "model.vision_tower.encoder.layer.11.attention.q_proj.bias": "model-00001-of-00004.safetensors",
475
+ "model.vision_tower.encoder.layer.11.attention.q_proj.weight": "model-00001-of-00004.safetensors",
476
+ "model.vision_tower.encoder.layer.11.attention.v_proj.bias": "model-00001-of-00004.safetensors",
477
+ "model.vision_tower.encoder.layer.11.attention.v_proj.weight": "model-00001-of-00004.safetensors",
478
+ "model.vision_tower.encoder.layer.11.lambda_1": "model-00001-of-00004.safetensors",
479
+ "model.vision_tower.encoder.layer.11.lambda_2": "model-00001-of-00004.safetensors",
480
+ "model.vision_tower.encoder.layer.11.layernorm_after.bias": "model-00001-of-00004.safetensors",
481
+ "model.vision_tower.encoder.layer.11.layernorm_after.weight": "model-00001-of-00004.safetensors",
482
+ "model.vision_tower.encoder.layer.11.layernorm_before.bias": "model-00001-of-00004.safetensors",
483
+ "model.vision_tower.encoder.layer.11.layernorm_before.weight": "model-00001-of-00004.safetensors",
484
+ "model.vision_tower.encoder.layer.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
485
+ "model.vision_tower.encoder.layer.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
486
+ "model.vision_tower.encoder.layer.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
487
+ "model.vision_tower.encoder.layer.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
488
+ "model.vision_tower.encoder.layer.12.attention.k_proj.bias": "model-00001-of-00004.safetensors",
489
+ "model.vision_tower.encoder.layer.12.attention.k_proj.weight": "model-00001-of-00004.safetensors",
490
+ "model.vision_tower.encoder.layer.12.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
491
+ "model.vision_tower.encoder.layer.12.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
492
+ "model.vision_tower.encoder.layer.12.attention.q_proj.bias": "model-00001-of-00004.safetensors",
493
+ "model.vision_tower.encoder.layer.12.attention.q_proj.weight": "model-00001-of-00004.safetensors",
494
+ "model.vision_tower.encoder.layer.12.attention.v_proj.bias": "model-00001-of-00004.safetensors",
495
+ "model.vision_tower.encoder.layer.12.attention.v_proj.weight": "model-00001-of-00004.safetensors",
496
+ "model.vision_tower.encoder.layer.12.lambda_1": "model-00001-of-00004.safetensors",
497
+ "model.vision_tower.encoder.layer.12.lambda_2": "model-00001-of-00004.safetensors",
498
+ "model.vision_tower.encoder.layer.12.layernorm_after.bias": "model-00001-of-00004.safetensors",
499
+ "model.vision_tower.encoder.layer.12.layernorm_after.weight": "model-00001-of-00004.safetensors",
500
+ "model.vision_tower.encoder.layer.12.layernorm_before.bias": "model-00001-of-00004.safetensors",
501
+ "model.vision_tower.encoder.layer.12.layernorm_before.weight": "model-00001-of-00004.safetensors",
502
+ "model.vision_tower.encoder.layer.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
503
+ "model.vision_tower.encoder.layer.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
504
+ "model.vision_tower.encoder.layer.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
505
+ "model.vision_tower.encoder.layer.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
506
+ "model.vision_tower.encoder.layer.13.attention.k_proj.bias": "model-00001-of-00004.safetensors",
507
+ "model.vision_tower.encoder.layer.13.attention.k_proj.weight": "model-00001-of-00004.safetensors",
508
+ "model.vision_tower.encoder.layer.13.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
509
+ "model.vision_tower.encoder.layer.13.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
510
+ "model.vision_tower.encoder.layer.13.attention.q_proj.bias": "model-00001-of-00004.safetensors",
511
+ "model.vision_tower.encoder.layer.13.attention.q_proj.weight": "model-00001-of-00004.safetensors",
512
+ "model.vision_tower.encoder.layer.13.attention.v_proj.bias": "model-00001-of-00004.safetensors",
513
+ "model.vision_tower.encoder.layer.13.attention.v_proj.weight": "model-00001-of-00004.safetensors",
514
+ "model.vision_tower.encoder.layer.13.lambda_1": "model-00001-of-00004.safetensors",
515
+ "model.vision_tower.encoder.layer.13.lambda_2": "model-00001-of-00004.safetensors",
516
+ "model.vision_tower.encoder.layer.13.layernorm_after.bias": "model-00001-of-00004.safetensors",
517
+ "model.vision_tower.encoder.layer.13.layernorm_after.weight": "model-00001-of-00004.safetensors",
518
+ "model.vision_tower.encoder.layer.13.layernorm_before.bias": "model-00001-of-00004.safetensors",
519
+ "model.vision_tower.encoder.layer.13.layernorm_before.weight": "model-00001-of-00004.safetensors",
520
+ "model.vision_tower.encoder.layer.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
521
+ "model.vision_tower.encoder.layer.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
522
+ "model.vision_tower.encoder.layer.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
523
+ "model.vision_tower.encoder.layer.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
524
+ "model.vision_tower.encoder.layer.14.attention.k_proj.bias": "model-00001-of-00004.safetensors",
525
+ "model.vision_tower.encoder.layer.14.attention.k_proj.weight": "model-00001-of-00004.safetensors",
526
+ "model.vision_tower.encoder.layer.14.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
527
+ "model.vision_tower.encoder.layer.14.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
528
+ "model.vision_tower.encoder.layer.14.attention.q_proj.bias": "model-00001-of-00004.safetensors",
529
+ "model.vision_tower.encoder.layer.14.attention.q_proj.weight": "model-00001-of-00004.safetensors",
530
+ "model.vision_tower.encoder.layer.14.attention.v_proj.bias": "model-00001-of-00004.safetensors",
531
+ "model.vision_tower.encoder.layer.14.attention.v_proj.weight": "model-00001-of-00004.safetensors",
532
+ "model.vision_tower.encoder.layer.14.lambda_1": "model-00001-of-00004.safetensors",
533
+ "model.vision_tower.encoder.layer.14.lambda_2": "model-00001-of-00004.safetensors",
534
+ "model.vision_tower.encoder.layer.14.layernorm_after.bias": "model-00001-of-00004.safetensors",
535
+ "model.vision_tower.encoder.layer.14.layernorm_after.weight": "model-00001-of-00004.safetensors",
536
+ "model.vision_tower.encoder.layer.14.layernorm_before.bias": "model-00001-of-00004.safetensors",
537
+ "model.vision_tower.encoder.layer.14.layernorm_before.weight": "model-00001-of-00004.safetensors",
538
+ "model.vision_tower.encoder.layer.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
539
+ "model.vision_tower.encoder.layer.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
540
+ "model.vision_tower.encoder.layer.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
541
+ "model.vision_tower.encoder.layer.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
542
+ "model.vision_tower.encoder.layer.15.attention.k_proj.bias": "model-00001-of-00004.safetensors",
543
+ "model.vision_tower.encoder.layer.15.attention.k_proj.weight": "model-00001-of-00004.safetensors",
544
+ "model.vision_tower.encoder.layer.15.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
545
+ "model.vision_tower.encoder.layer.15.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
546
+ "model.vision_tower.encoder.layer.15.attention.q_proj.bias": "model-00001-of-00004.safetensors",
547
+ "model.vision_tower.encoder.layer.15.attention.q_proj.weight": "model-00001-of-00004.safetensors",
548
+ "model.vision_tower.encoder.layer.15.attention.v_proj.bias": "model-00001-of-00004.safetensors",
549
+ "model.vision_tower.encoder.layer.15.attention.v_proj.weight": "model-00001-of-00004.safetensors",
550
+ "model.vision_tower.encoder.layer.15.lambda_1": "model-00001-of-00004.safetensors",
551
+ "model.vision_tower.encoder.layer.15.lambda_2": "model-00001-of-00004.safetensors",
552
+ "model.vision_tower.encoder.layer.15.layernorm_after.bias": "model-00001-of-00004.safetensors",
553
+ "model.vision_tower.encoder.layer.15.layernorm_after.weight": "model-00001-of-00004.safetensors",
554
+ "model.vision_tower.encoder.layer.15.layernorm_before.bias": "model-00001-of-00004.safetensors",
555
+ "model.vision_tower.encoder.layer.15.layernorm_before.weight": "model-00001-of-00004.safetensors",
556
+ "model.vision_tower.encoder.layer.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
557
+ "model.vision_tower.encoder.layer.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
558
+ "model.vision_tower.encoder.layer.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
559
+ "model.vision_tower.encoder.layer.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
560
+ "model.vision_tower.encoder.layer.16.attention.k_proj.bias": "model-00001-of-00004.safetensors",
561
+ "model.vision_tower.encoder.layer.16.attention.k_proj.weight": "model-00001-of-00004.safetensors",
562
+ "model.vision_tower.encoder.layer.16.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
563
+ "model.vision_tower.encoder.layer.16.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
564
+ "model.vision_tower.encoder.layer.16.attention.q_proj.bias": "model-00001-of-00004.safetensors",
565
+ "model.vision_tower.encoder.layer.16.attention.q_proj.weight": "model-00001-of-00004.safetensors",
566
+ "model.vision_tower.encoder.layer.16.attention.v_proj.bias": "model-00001-of-00004.safetensors",
567
+ "model.vision_tower.encoder.layer.16.attention.v_proj.weight": "model-00001-of-00004.safetensors",
568
+ "model.vision_tower.encoder.layer.16.lambda_1": "model-00001-of-00004.safetensors",
569
+ "model.vision_tower.encoder.layer.16.lambda_2": "model-00001-of-00004.safetensors",
570
+ "model.vision_tower.encoder.layer.16.layernorm_after.bias": "model-00001-of-00004.safetensors",
571
+ "model.vision_tower.encoder.layer.16.layernorm_after.weight": "model-00001-of-00004.safetensors",
572
+ "model.vision_tower.encoder.layer.16.layernorm_before.bias": "model-00001-of-00004.safetensors",
573
+ "model.vision_tower.encoder.layer.16.layernorm_before.weight": "model-00001-of-00004.safetensors",
574
+ "model.vision_tower.encoder.layer.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
575
+ "model.vision_tower.encoder.layer.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
576
+ "model.vision_tower.encoder.layer.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
577
+ "model.vision_tower.encoder.layer.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
578
+ "model.vision_tower.encoder.layer.17.attention.k_proj.bias": "model-00001-of-00004.safetensors",
579
+ "model.vision_tower.encoder.layer.17.attention.k_proj.weight": "model-00001-of-00004.safetensors",
580
+ "model.vision_tower.encoder.layer.17.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
581
+ "model.vision_tower.encoder.layer.17.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
582
+ "model.vision_tower.encoder.layer.17.attention.q_proj.bias": "model-00001-of-00004.safetensors",
583
+ "model.vision_tower.encoder.layer.17.attention.q_proj.weight": "model-00001-of-00004.safetensors",
584
+ "model.vision_tower.encoder.layer.17.attention.v_proj.bias": "model-00001-of-00004.safetensors",
585
+ "model.vision_tower.encoder.layer.17.attention.v_proj.weight": "model-00001-of-00004.safetensors",
586
+ "model.vision_tower.encoder.layer.17.lambda_1": "model-00001-of-00004.safetensors",
587
+ "model.vision_tower.encoder.layer.17.lambda_2": "model-00001-of-00004.safetensors",
588
+ "model.vision_tower.encoder.layer.17.layernorm_after.bias": "model-00001-of-00004.safetensors",
589
+ "model.vision_tower.encoder.layer.17.layernorm_after.weight": "model-00001-of-00004.safetensors",
590
+ "model.vision_tower.encoder.layer.17.layernorm_before.bias": "model-00001-of-00004.safetensors",
591
+ "model.vision_tower.encoder.layer.17.layernorm_before.weight": "model-00001-of-00004.safetensors",
592
+ "model.vision_tower.encoder.layer.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
593
+ "model.vision_tower.encoder.layer.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
594
+ "model.vision_tower.encoder.layer.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
595
+ "model.vision_tower.encoder.layer.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
596
+ "model.vision_tower.encoder.layer.18.attention.k_proj.bias": "model-00001-of-00004.safetensors",
597
+ "model.vision_tower.encoder.layer.18.attention.k_proj.weight": "model-00001-of-00004.safetensors",
598
+ "model.vision_tower.encoder.layer.18.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
599
+ "model.vision_tower.encoder.layer.18.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
600
+ "model.vision_tower.encoder.layer.18.attention.q_proj.bias": "model-00001-of-00004.safetensors",
601
+ "model.vision_tower.encoder.layer.18.attention.q_proj.weight": "model-00001-of-00004.safetensors",
602
+ "model.vision_tower.encoder.layer.18.attention.v_proj.bias": "model-00001-of-00004.safetensors",
603
+ "model.vision_tower.encoder.layer.18.attention.v_proj.weight": "model-00001-of-00004.safetensors",
604
+ "model.vision_tower.encoder.layer.18.lambda_1": "model-00001-of-00004.safetensors",
605
+ "model.vision_tower.encoder.layer.18.lambda_2": "model-00001-of-00004.safetensors",
606
+ "model.vision_tower.encoder.layer.18.layernorm_after.bias": "model-00001-of-00004.safetensors",
607
+ "model.vision_tower.encoder.layer.18.layernorm_after.weight": "model-00001-of-00004.safetensors",
608
+ "model.vision_tower.encoder.layer.18.layernorm_before.bias": "model-00001-of-00004.safetensors",
609
+ "model.vision_tower.encoder.layer.18.layernorm_before.weight": "model-00001-of-00004.safetensors",
610
+ "model.vision_tower.encoder.layer.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
611
+ "model.vision_tower.encoder.layer.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
612
+ "model.vision_tower.encoder.layer.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
613
+ "model.vision_tower.encoder.layer.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
614
+ "model.vision_tower.encoder.layer.19.attention.k_proj.bias": "model-00001-of-00004.safetensors",
615
+ "model.vision_tower.encoder.layer.19.attention.k_proj.weight": "model-00001-of-00004.safetensors",
616
+ "model.vision_tower.encoder.layer.19.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
617
+ "model.vision_tower.encoder.layer.19.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
618
+ "model.vision_tower.encoder.layer.19.attention.q_proj.bias": "model-00001-of-00004.safetensors",
619
+ "model.vision_tower.encoder.layer.19.attention.q_proj.weight": "model-00001-of-00004.safetensors",
620
+ "model.vision_tower.encoder.layer.19.attention.v_proj.bias": "model-00001-of-00004.safetensors",
621
+ "model.vision_tower.encoder.layer.19.attention.v_proj.weight": "model-00001-of-00004.safetensors",
622
+ "model.vision_tower.encoder.layer.19.lambda_1": "model-00001-of-00004.safetensors",
623
+ "model.vision_tower.encoder.layer.19.lambda_2": "model-00001-of-00004.safetensors",
624
+ "model.vision_tower.encoder.layer.19.layernorm_after.bias": "model-00001-of-00004.safetensors",
625
+ "model.vision_tower.encoder.layer.19.layernorm_after.weight": "model-00001-of-00004.safetensors",
626
+ "model.vision_tower.encoder.layer.19.layernorm_before.bias": "model-00001-of-00004.safetensors",
627
+ "model.vision_tower.encoder.layer.19.layernorm_before.weight": "model-00001-of-00004.safetensors",
628
+ "model.vision_tower.encoder.layer.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
629
+ "model.vision_tower.encoder.layer.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
630
+ "model.vision_tower.encoder.layer.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
631
+ "model.vision_tower.encoder.layer.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
632
+ "model.vision_tower.encoder.layer.2.attention.k_proj.bias": "model-00001-of-00004.safetensors",
633
+ "model.vision_tower.encoder.layer.2.attention.k_proj.weight": "model-00001-of-00004.safetensors",
634
+ "model.vision_tower.encoder.layer.2.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
635
+ "model.vision_tower.encoder.layer.2.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
636
+ "model.vision_tower.encoder.layer.2.attention.q_proj.bias": "model-00001-of-00004.safetensors",
637
+ "model.vision_tower.encoder.layer.2.attention.q_proj.weight": "model-00001-of-00004.safetensors",
638
+ "model.vision_tower.encoder.layer.2.attention.v_proj.bias": "model-00001-of-00004.safetensors",
639
+ "model.vision_tower.encoder.layer.2.attention.v_proj.weight": "model-00001-of-00004.safetensors",
640
+ "model.vision_tower.encoder.layer.2.lambda_1": "model-00001-of-00004.safetensors",
641
+ "model.vision_tower.encoder.layer.2.lambda_2": "model-00001-of-00004.safetensors",
642
+ "model.vision_tower.encoder.layer.2.layernorm_after.bias": "model-00001-of-00004.safetensors",
643
+ "model.vision_tower.encoder.layer.2.layernorm_after.weight": "model-00001-of-00004.safetensors",
644
+ "model.vision_tower.encoder.layer.2.layernorm_before.bias": "model-00001-of-00004.safetensors",
645
+ "model.vision_tower.encoder.layer.2.layernorm_before.weight": "model-00001-of-00004.safetensors",
646
+ "model.vision_tower.encoder.layer.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
647
+ "model.vision_tower.encoder.layer.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
648
+ "model.vision_tower.encoder.layer.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
649
+ "model.vision_tower.encoder.layer.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
650
+ "model.vision_tower.encoder.layer.20.attention.k_proj.bias": "model-00001-of-00004.safetensors",
651
+ "model.vision_tower.encoder.layer.20.attention.k_proj.weight": "model-00001-of-00004.safetensors",
652
+ "model.vision_tower.encoder.layer.20.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
653
+ "model.vision_tower.encoder.layer.20.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
654
+ "model.vision_tower.encoder.layer.20.attention.q_proj.bias": "model-00001-of-00004.safetensors",
655
+ "model.vision_tower.encoder.layer.20.attention.q_proj.weight": "model-00001-of-00004.safetensors",
656
+ "model.vision_tower.encoder.layer.20.attention.v_proj.bias": "model-00001-of-00004.safetensors",
657
+ "model.vision_tower.encoder.layer.20.attention.v_proj.weight": "model-00001-of-00004.safetensors",
658
+ "model.vision_tower.encoder.layer.20.lambda_1": "model-00001-of-00004.safetensors",
659
+ "model.vision_tower.encoder.layer.20.lambda_2": "model-00001-of-00004.safetensors",
660
+ "model.vision_tower.encoder.layer.20.layernorm_after.bias": "model-00001-of-00004.safetensors",
661
+ "model.vision_tower.encoder.layer.20.layernorm_after.weight": "model-00001-of-00004.safetensors",
662
+ "model.vision_tower.encoder.layer.20.layernorm_before.bias": "model-00001-of-00004.safetensors",
663
+ "model.vision_tower.encoder.layer.20.layernorm_before.weight": "model-00001-of-00004.safetensors",
664
+ "model.vision_tower.encoder.layer.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
665
+ "model.vision_tower.encoder.layer.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
666
+ "model.vision_tower.encoder.layer.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
667
+ "model.vision_tower.encoder.layer.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
668
+ "model.vision_tower.encoder.layer.21.attention.k_proj.bias": "model-00001-of-00004.safetensors",
669
+ "model.vision_tower.encoder.layer.21.attention.k_proj.weight": "model-00001-of-00004.safetensors",
670
+ "model.vision_tower.encoder.layer.21.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
671
+ "model.vision_tower.encoder.layer.21.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
672
+ "model.vision_tower.encoder.layer.21.attention.q_proj.bias": "model-00001-of-00004.safetensors",
673
+ "model.vision_tower.encoder.layer.21.attention.q_proj.weight": "model-00001-of-00004.safetensors",
674
+ "model.vision_tower.encoder.layer.21.attention.v_proj.bias": "model-00001-of-00004.safetensors",
675
+ "model.vision_tower.encoder.layer.21.attention.v_proj.weight": "model-00001-of-00004.safetensors",
676
+ "model.vision_tower.encoder.layer.21.lambda_1": "model-00001-of-00004.safetensors",
677
+ "model.vision_tower.encoder.layer.21.lambda_2": "model-00001-of-00004.safetensors",
678
+ "model.vision_tower.encoder.layer.21.layernorm_after.bias": "model-00001-of-00004.safetensors",
679
+ "model.vision_tower.encoder.layer.21.layernorm_after.weight": "model-00001-of-00004.safetensors",
680
+ "model.vision_tower.encoder.layer.21.layernorm_before.bias": "model-00001-of-00004.safetensors",
681
+ "model.vision_tower.encoder.layer.21.layernorm_before.weight": "model-00001-of-00004.safetensors",
682
+ "model.vision_tower.encoder.layer.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
683
+ "model.vision_tower.encoder.layer.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
684
+ "model.vision_tower.encoder.layer.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
685
+ "model.vision_tower.encoder.layer.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
686
+ "model.vision_tower.encoder.layer.22.attention.k_proj.bias": "model-00001-of-00004.safetensors",
687
+ "model.vision_tower.encoder.layer.22.attention.k_proj.weight": "model-00001-of-00004.safetensors",
688
+ "model.vision_tower.encoder.layer.22.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
689
+ "model.vision_tower.encoder.layer.22.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
690
+ "model.vision_tower.encoder.layer.22.attention.q_proj.bias": "model-00001-of-00004.safetensors",
691
+ "model.vision_tower.encoder.layer.22.attention.q_proj.weight": "model-00001-of-00004.safetensors",
692
+ "model.vision_tower.encoder.layer.22.attention.v_proj.bias": "model-00001-of-00004.safetensors",
693
+ "model.vision_tower.encoder.layer.22.attention.v_proj.weight": "model-00001-of-00004.safetensors",
694
+ "model.vision_tower.encoder.layer.22.lambda_1": "model-00001-of-00004.safetensors",
695
+ "model.vision_tower.encoder.layer.22.lambda_2": "model-00001-of-00004.safetensors",
696
+ "model.vision_tower.encoder.layer.22.layernorm_after.bias": "model-00001-of-00004.safetensors",
697
+ "model.vision_tower.encoder.layer.22.layernorm_after.weight": "model-00001-of-00004.safetensors",
698
+ "model.vision_tower.encoder.layer.22.layernorm_before.bias": "model-00001-of-00004.safetensors",
699
+ "model.vision_tower.encoder.layer.22.layernorm_before.weight": "model-00001-of-00004.safetensors",
700
+ "model.vision_tower.encoder.layer.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
701
+ "model.vision_tower.encoder.layer.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
702
+ "model.vision_tower.encoder.layer.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
703
+ "model.vision_tower.encoder.layer.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
704
+ "model.vision_tower.encoder.layer.23.attention.k_proj.bias": "model-00001-of-00004.safetensors",
705
+ "model.vision_tower.encoder.layer.23.attention.k_proj.weight": "model-00001-of-00004.safetensors",
706
+ "model.vision_tower.encoder.layer.23.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
707
+ "model.vision_tower.encoder.layer.23.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
708
+ "model.vision_tower.encoder.layer.23.attention.q_proj.bias": "model-00001-of-00004.safetensors",
709
+ "model.vision_tower.encoder.layer.23.attention.q_proj.weight": "model-00001-of-00004.safetensors",
710
+ "model.vision_tower.encoder.layer.23.attention.v_proj.bias": "model-00001-of-00004.safetensors",
711
+ "model.vision_tower.encoder.layer.23.attention.v_proj.weight": "model-00001-of-00004.safetensors",
712
+ "model.vision_tower.encoder.layer.23.lambda_1": "model-00001-of-00004.safetensors",
713
+ "model.vision_tower.encoder.layer.23.lambda_2": "model-00001-of-00004.safetensors",
714
+ "model.vision_tower.encoder.layer.23.layernorm_after.bias": "model-00001-of-00004.safetensors",
715
+ "model.vision_tower.encoder.layer.23.layernorm_after.weight": "model-00001-of-00004.safetensors",
716
+ "model.vision_tower.encoder.layer.23.layernorm_before.bias": "model-00001-of-00004.safetensors",
717
+ "model.vision_tower.encoder.layer.23.layernorm_before.weight": "model-00001-of-00004.safetensors",
718
+ "model.vision_tower.encoder.layer.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
719
+ "model.vision_tower.encoder.layer.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
720
+ "model.vision_tower.encoder.layer.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
721
+ "model.vision_tower.encoder.layer.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
722
+ "model.vision_tower.encoder.layer.3.attention.k_proj.bias": "model-00001-of-00004.safetensors",
723
+ "model.vision_tower.encoder.layer.3.attention.k_proj.weight": "model-00001-of-00004.safetensors",
724
+ "model.vision_tower.encoder.layer.3.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
725
+ "model.vision_tower.encoder.layer.3.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
726
+ "model.vision_tower.encoder.layer.3.attention.q_proj.bias": "model-00001-of-00004.safetensors",
727
+ "model.vision_tower.encoder.layer.3.attention.q_proj.weight": "model-00001-of-00004.safetensors",
728
+ "model.vision_tower.encoder.layer.3.attention.v_proj.bias": "model-00001-of-00004.safetensors",
729
+ "model.vision_tower.encoder.layer.3.attention.v_proj.weight": "model-00001-of-00004.safetensors",
730
+ "model.vision_tower.encoder.layer.3.lambda_1": "model-00001-of-00004.safetensors",
731
+ "model.vision_tower.encoder.layer.3.lambda_2": "model-00001-of-00004.safetensors",
732
+ "model.vision_tower.encoder.layer.3.layernorm_after.bias": "model-00001-of-00004.safetensors",
733
+ "model.vision_tower.encoder.layer.3.layernorm_after.weight": "model-00001-of-00004.safetensors",
734
+ "model.vision_tower.encoder.layer.3.layernorm_before.bias": "model-00001-of-00004.safetensors",
735
+ "model.vision_tower.encoder.layer.3.layernorm_before.weight": "model-00001-of-00004.safetensors",
736
+ "model.vision_tower.encoder.layer.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
737
+ "model.vision_tower.encoder.layer.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
738
+ "model.vision_tower.encoder.layer.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
739
+ "model.vision_tower.encoder.layer.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
740
+ "model.vision_tower.encoder.layer.4.attention.k_proj.bias": "model-00001-of-00004.safetensors",
741
+ "model.vision_tower.encoder.layer.4.attention.k_proj.weight": "model-00001-of-00004.safetensors",
742
+ "model.vision_tower.encoder.layer.4.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
743
+ "model.vision_tower.encoder.layer.4.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
744
+ "model.vision_tower.encoder.layer.4.attention.q_proj.bias": "model-00001-of-00004.safetensors",
745
+ "model.vision_tower.encoder.layer.4.attention.q_proj.weight": "model-00001-of-00004.safetensors",
746
+ "model.vision_tower.encoder.layer.4.attention.v_proj.bias": "model-00001-of-00004.safetensors",
747
+ "model.vision_tower.encoder.layer.4.attention.v_proj.weight": "model-00001-of-00004.safetensors",
748
+ "model.vision_tower.encoder.layer.4.lambda_1": "model-00001-of-00004.safetensors",
749
+ "model.vision_tower.encoder.layer.4.lambda_2": "model-00001-of-00004.safetensors",
750
+ "model.vision_tower.encoder.layer.4.layernorm_after.bias": "model-00001-of-00004.safetensors",
751
+ "model.vision_tower.encoder.layer.4.layernorm_after.weight": "model-00001-of-00004.safetensors",
752
+ "model.vision_tower.encoder.layer.4.layernorm_before.bias": "model-00001-of-00004.safetensors",
753
+ "model.vision_tower.encoder.layer.4.layernorm_before.weight": "model-00001-of-00004.safetensors",
754
+ "model.vision_tower.encoder.layer.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
755
+ "model.vision_tower.encoder.layer.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
756
+ "model.vision_tower.encoder.layer.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
757
+ "model.vision_tower.encoder.layer.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
758
+ "model.vision_tower.encoder.layer.5.attention.k_proj.bias": "model-00001-of-00004.safetensors",
759
+ "model.vision_tower.encoder.layer.5.attention.k_proj.weight": "model-00001-of-00004.safetensors",
760
+ "model.vision_tower.encoder.layer.5.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
761
+ "model.vision_tower.encoder.layer.5.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
762
+ "model.vision_tower.encoder.layer.5.attention.q_proj.bias": "model-00001-of-00004.safetensors",
763
+ "model.vision_tower.encoder.layer.5.attention.q_proj.weight": "model-00001-of-00004.safetensors",
764
+ "model.vision_tower.encoder.layer.5.attention.v_proj.bias": "model-00001-of-00004.safetensors",
765
+ "model.vision_tower.encoder.layer.5.attention.v_proj.weight": "model-00001-of-00004.safetensors",
766
+ "model.vision_tower.encoder.layer.5.lambda_1": "model-00001-of-00004.safetensors",
767
+ "model.vision_tower.encoder.layer.5.lambda_2": "model-00001-of-00004.safetensors",
768
+ "model.vision_tower.encoder.layer.5.layernorm_after.bias": "model-00001-of-00004.safetensors",
769
+ "model.vision_tower.encoder.layer.5.layernorm_after.weight": "model-00001-of-00004.safetensors",
770
+ "model.vision_tower.encoder.layer.5.layernorm_before.bias": "model-00001-of-00004.safetensors",
771
+ "model.vision_tower.encoder.layer.5.layernorm_before.weight": "model-00001-of-00004.safetensors",
772
+ "model.vision_tower.encoder.layer.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
773
+ "model.vision_tower.encoder.layer.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
774
+ "model.vision_tower.encoder.layer.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
775
+ "model.vision_tower.encoder.layer.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
776
+ "model.vision_tower.encoder.layer.6.attention.k_proj.bias": "model-00001-of-00004.safetensors",
777
+ "model.vision_tower.encoder.layer.6.attention.k_proj.weight": "model-00001-of-00004.safetensors",
778
+ "model.vision_tower.encoder.layer.6.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
779
+ "model.vision_tower.encoder.layer.6.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
780
+ "model.vision_tower.encoder.layer.6.attention.q_proj.bias": "model-00001-of-00004.safetensors",
781
+ "model.vision_tower.encoder.layer.6.attention.q_proj.weight": "model-00001-of-00004.safetensors",
782
+ "model.vision_tower.encoder.layer.6.attention.v_proj.bias": "model-00001-of-00004.safetensors",
783
+ "model.vision_tower.encoder.layer.6.attention.v_proj.weight": "model-00001-of-00004.safetensors",
784
+ "model.vision_tower.encoder.layer.6.lambda_1": "model-00001-of-00004.safetensors",
785
+ "model.vision_tower.encoder.layer.6.lambda_2": "model-00001-of-00004.safetensors",
786
+ "model.vision_tower.encoder.layer.6.layernorm_after.bias": "model-00001-of-00004.safetensors",
787
+ "model.vision_tower.encoder.layer.6.layernorm_after.weight": "model-00001-of-00004.safetensors",
788
+ "model.vision_tower.encoder.layer.6.layernorm_before.bias": "model-00001-of-00004.safetensors",
789
+ "model.vision_tower.encoder.layer.6.layernorm_before.weight": "model-00001-of-00004.safetensors",
790
+ "model.vision_tower.encoder.layer.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
791
+ "model.vision_tower.encoder.layer.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
792
+ "model.vision_tower.encoder.layer.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
793
+ "model.vision_tower.encoder.layer.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
794
+ "model.vision_tower.encoder.layer.7.attention.k_proj.bias": "model-00001-of-00004.safetensors",
795
+ "model.vision_tower.encoder.layer.7.attention.k_proj.weight": "model-00001-of-00004.safetensors",
796
+ "model.vision_tower.encoder.layer.7.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
797
+ "model.vision_tower.encoder.layer.7.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
798
+ "model.vision_tower.encoder.layer.7.attention.q_proj.bias": "model-00001-of-00004.safetensors",
799
+ "model.vision_tower.encoder.layer.7.attention.q_proj.weight": "model-00001-of-00004.safetensors",
800
+ "model.vision_tower.encoder.layer.7.attention.v_proj.bias": "model-00001-of-00004.safetensors",
801
+ "model.vision_tower.encoder.layer.7.attention.v_proj.weight": "model-00001-of-00004.safetensors",
802
+ "model.vision_tower.encoder.layer.7.lambda_1": "model-00001-of-00004.safetensors",
803
+ "model.vision_tower.encoder.layer.7.lambda_2": "model-00001-of-00004.safetensors",
804
+ "model.vision_tower.encoder.layer.7.layernorm_after.bias": "model-00001-of-00004.safetensors",
805
+ "model.vision_tower.encoder.layer.7.layernorm_after.weight": "model-00001-of-00004.safetensors",
806
+ "model.vision_tower.encoder.layer.7.layernorm_before.bias": "model-00001-of-00004.safetensors",
807
+ "model.vision_tower.encoder.layer.7.layernorm_before.weight": "model-00001-of-00004.safetensors",
808
+ "model.vision_tower.encoder.layer.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
809
+ "model.vision_tower.encoder.layer.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
810
+ "model.vision_tower.encoder.layer.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
811
+ "model.vision_tower.encoder.layer.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
812
+ "model.vision_tower.encoder.layer.8.attention.k_proj.bias": "model-00001-of-00004.safetensors",
813
+ "model.vision_tower.encoder.layer.8.attention.k_proj.weight": "model-00001-of-00004.safetensors",
814
+ "model.vision_tower.encoder.layer.8.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
815
+ "model.vision_tower.encoder.layer.8.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
816
+ "model.vision_tower.encoder.layer.8.attention.q_proj.bias": "model-00001-of-00004.safetensors",
817
+ "model.vision_tower.encoder.layer.8.attention.q_proj.weight": "model-00001-of-00004.safetensors",
818
+ "model.vision_tower.encoder.layer.8.attention.v_proj.bias": "model-00001-of-00004.safetensors",
819
+ "model.vision_tower.encoder.layer.8.attention.v_proj.weight": "model-00001-of-00004.safetensors",
820
+ "model.vision_tower.encoder.layer.8.lambda_1": "model-00001-of-00004.safetensors",
821
+ "model.vision_tower.encoder.layer.8.lambda_2": "model-00001-of-00004.safetensors",
822
+ "model.vision_tower.encoder.layer.8.layernorm_after.bias": "model-00001-of-00004.safetensors",
823
+ "model.vision_tower.encoder.layer.8.layernorm_after.weight": "model-00001-of-00004.safetensors",
824
+ "model.vision_tower.encoder.layer.8.layernorm_before.bias": "model-00001-of-00004.safetensors",
825
+ "model.vision_tower.encoder.layer.8.layernorm_before.weight": "model-00001-of-00004.safetensors",
826
+ "model.vision_tower.encoder.layer.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
827
+ "model.vision_tower.encoder.layer.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
828
+ "model.vision_tower.encoder.layer.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
829
+ "model.vision_tower.encoder.layer.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
830
+ "model.vision_tower.encoder.layer.9.attention.k_proj.bias": "model-00001-of-00004.safetensors",
831
+ "model.vision_tower.encoder.layer.9.attention.k_proj.weight": "model-00001-of-00004.safetensors",
832
+ "model.vision_tower.encoder.layer.9.attention.projection_layer.bias": "model-00001-of-00004.safetensors",
833
+ "model.vision_tower.encoder.layer.9.attention.projection_layer.weight": "model-00001-of-00004.safetensors",
834
+ "model.vision_tower.encoder.layer.9.attention.q_proj.bias": "model-00001-of-00004.safetensors",
835
+ "model.vision_tower.encoder.layer.9.attention.q_proj.weight": "model-00001-of-00004.safetensors",
836
+ "model.vision_tower.encoder.layer.9.attention.v_proj.bias": "model-00001-of-00004.safetensors",
837
+ "model.vision_tower.encoder.layer.9.attention.v_proj.weight": "model-00001-of-00004.safetensors",
838
+ "model.vision_tower.encoder.layer.9.lambda_1": "model-00001-of-00004.safetensors",
839
+ "model.vision_tower.encoder.layer.9.lambda_2": "model-00001-of-00004.safetensors",
840
+ "model.vision_tower.encoder.layer.9.layernorm_after.bias": "model-00001-of-00004.safetensors",
841
+ "model.vision_tower.encoder.layer.9.layernorm_after.weight": "model-00001-of-00004.safetensors",
842
+ "model.vision_tower.encoder.layer.9.layernorm_before.bias": "model-00001-of-00004.safetensors",
843
+ "model.vision_tower.encoder.layer.9.layernorm_before.weight": "model-00001-of-00004.safetensors",
844
+ "model.vision_tower.encoder.layer.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
845
+ "model.vision_tower.encoder.layer.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
846
+ "model.vision_tower.encoder.layer.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
847
+ "model.vision_tower.encoder.layer.9.mlp.fc2.weight": "model-00001-of-00004.safetensors"
848
+ }
849
+ }
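
Note: the `weight_map` above follows the standard Hugging Face sharded-checkpoint index format, where each parameter name maps to the shard file that stores it. A minimal sketch of looking up and loading a single tensor from the index (the local file paths and the chosen tensor name are illustrative assumptions, taken from the map above):

```python
# Sketch only, not part of the upload: resolve which shard holds a tensor and
# load just that tensor. Assumes the repo files have been downloaded locally.
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.language_model.layers.4.mlp.down_proj.weight"
shard_file = index["weight_map"][name]  # e.g. "model-00001-of-00004.safetensors"

with safe_open(shard_file, framework="pt") as shard:
    tensor = shard.get_tensor(name)  # lazily reads only this tensor from the shard
print(name, tuple(tensor.shape), shard_file)
```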
modeling_interns1.py ADDED
@@ -0,0 +1,1200 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/interns1/modular_interns1.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_interns1.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2025 HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+
23
+ import collections.abc
24
+ from dataclasses import dataclass
25
+ from typing import Callable, Optional, Union
26
+ import numpy as np
27
+ import torch
28
+ import torch.nn as nn
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.integrations import use_kernel_forward_from_hub
34
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
35
+ from transformers.modeling_layers import GradientCheckpointingLayer
36
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
37
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
38
+ from transformers.processing_utils import Unpack
39
+ from transformers.utils import (
40
+ ModelOutput,
41
+ auto_docstring,
42
+ can_return_tuple,
43
+ is_torchdynamo_compiling,
44
+ torch_int,
45
+ )
46
+ from transformers import AutoModel
47
+ from .configuration_interns1 import InternS1Config, InternS1VisionConfig
48
+
49
+
50
+ @use_kernel_forward_from_hub("RMSNorm")
51
+ class InternS1VisionRMSNorm(nn.Module):
52
+ def __init__(self, hidden_size, eps=1e-6):
53
+ """
54
+ InternS1VisionRMSNorm is equivalent to T5LayerNorm
55
+ """
56
+ super().__init__()
57
+ self.weight = nn.Parameter(torch.ones(hidden_size))
58
+ self.variance_epsilon = eps
59
+
60
+ def forward(self, hidden_states):
61
+ input_dtype = hidden_states.dtype
62
+ hidden_states = hidden_states.to(torch.float32)
63
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
64
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
65
+ return self.weight * hidden_states.to(input_dtype)
66
+
67
+ def extra_repr(self):
68
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
69
+
70
+
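A quick sanity sketch of the RMSNorm defined above (illustrative, not part of the file): it normalizes by the root-mean-square over the last dimension in float32 and rescales by a learned weight. The hidden size here is an arbitrary choice, and the snippet assumes `InternS1VisionRMSNorm` from this file is in scope.

```python
# Illustrative check that InternS1VisionRMSNorm matches a manual RMS normalization.
import torch

norm = InternS1VisionRMSNorm(hidden_size=8, eps=1e-6)
x = torch.randn(2, 3, 8)

# Same computation written out by hand: x / rms(x) scaled by the learned weight.
manual = norm.weight * (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6))

assert torch.allclose(norm(x), manual)
```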
71
+ def eager_attention_forward(
72
+ module: nn.Module,
73
+ query: torch.Tensor,
74
+ key: torch.Tensor,
75
+ value: torch.Tensor,
76
+ attention_mask: Optional[torch.Tensor],
77
+ scaling: float,
78
+ dropout: float = 0.0,
79
+ **kwargs,
80
+ ):
81
+ key_states = key
82
+ value_states = value
83
+
84
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
85
+ if attention_mask is not None:
86
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
87
+ attn_weights = attn_weights + causal_mask
88
+
89
+ # No upcasting of the attention weights to float32 in this implementation
90
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
91
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
92
+ attn_output = torch.matmul(attn_weights, value_states)
93
+ attn_output = attn_output.transpose(1, 2).contiguous()
94
+
95
+ return attn_output, attn_weights
96
+
97
+
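For reference, a small shape check of `eager_attention_forward` defined above (an assumed standalone call, outside the model): inputs are `(batch, heads, seq, head_dim)` tensors and the output comes back transposed to `(batch, seq, heads, head_dim)`.

```python
# Illustrative shape check for the eager attention path above.
import torch
import torch.nn as nn

batch, heads, seq, head_dim = 2, 4, 16, 32
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
v = torch.randn(batch, heads, seq, head_dim)

module = nn.Module()  # only `.training` is read, for the dropout call
out, weights = eager_attention_forward(
    module, q, k, v, attention_mask=None, scaling=head_dim**-0.5
)
assert out.shape == (batch, seq, heads, head_dim)
assert weights.shape == (batch, heads, seq, seq)
```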
98
+ class InternS1VisionAttention(nn.Module):
99
+ """Attention Class for InternS1 Vision Encoder"""
100
+
101
+ def __init__(self, config: InternS1VisionConfig):
102
+ super().__init__()
103
+ self.config = config
104
+ self.embed_dim = config.hidden_size
105
+ self.num_heads = config.num_attention_heads
106
+ self.head_dim = self.embed_dim // self.num_heads
107
+ if self.head_dim * self.num_heads != self.embed_dim:
108
+ raise ValueError(
109
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
110
+ f" {self.num_heads})."
111
+ )
112
+ self.scale = self.head_dim ** -0.5
113
+ self.attention_dropout = config.attention_dropout
114
+ proj_dropout = config.projection_dropout
115
+ qk_norm = config.use_qk_norm
116
+
117
+ # Needed for flash attention
118
+ self.is_causal = False
119
+
120
+ self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
121
+ self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
122
+ self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
123
+ self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim)
124
+ self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity()
125
+
126
+ self.q_norm = InternS1VisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
127
+ self.k_norm = InternS1VisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
128
+
129
+ def forward(
130
+ self,
131
+ hidden_states: torch.Tensor,
132
+ attention_mask: Optional[torch.Tensor] = None,
133
+ output_attentions: Optional[bool] = None,
134
+ **kwargs: Unpack[FlashAttentionKwargs],
135
+ ):
136
+ batch_size, seq_len, _ = hidden_states.size()
137
+
138
+ query_states = self.q_proj(hidden_states)
139
+ key_states = self.k_proj(hidden_states)
140
+ value_states = self.v_proj(hidden_states)
141
+
142
+ query_states = self.q_norm(query_states)
143
+ key_states = self.k_norm(key_states)
144
+
145
+ query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
146
+ key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
147
+ value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
148
+
149
+ attention_interface: Callable = eager_attention_forward
150
+ if self.config._attn_implementation != "eager":
151
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
152
+
153
+ attn_output, attn_weights = attention_interface(
154
+ self,
155
+ query_states,
156
+ key_states,
157
+ value_states,
158
+ attention_mask,
159
+ dropout=0.0 if not self.training else self.attention_dropout,
160
+ scaling=self.scale,
161
+ is_causal=False,
162
+ **kwargs,
163
+ )
164
+ attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)
165
+
166
+ output = self.projection_layer(attn_output)
167
+ output = self.projection_dropout(output)
168
+
169
+ outputs = (output, attn_weights) if output_attentions else (output, None)
170
+ return outputs
171
+
172
+
173
+ @auto_docstring
174
+ class InternS1VisionPreTrainedModel(PreTrainedModel):
175
+ config_class = InternS1VisionConfig
176
+ base_model_prefix = "interns1_vision"
177
+ main_input_name = "pixel_values"
178
+ supports_gradient_checkpointing = True
179
+ _no_split_modules = ["InternS1VisionLayer"]
180
+ _supports_sdpa = True
181
+ _supports_flash_attn = True
182
+ _supports_flex_attn = True
183
+ _supports_attention_backend = True
184
+
185
+ def _init_weights(self, module):
186
+ """Initialize the weights"""
187
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
188
+ # Slightly different from the TF version which uses truncated_normal for initialization
189
+ # cf https://github.com/pytorch/pytorch/pull/5617
190
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
191
+ if module.bias is not None:
192
+ module.bias.data.zero_()
193
+ elif isinstance(module, nn.Embedding):
194
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
195
+ if module.padding_idx is not None:
196
+ module.weight.data[module.padding_idx].zero_()
197
+ elif isinstance(module, nn.LayerNorm):
198
+ module.bias.data.zero_()
199
+ module.weight.data.fill_(1.0)
200
+ elif isinstance(module, InternS1VisionEmbeddings):
201
+ module.cls_token.data.zero_()
202
+ if module.mask_token is not None:
203
+ module.mask_token.data.zero_()
204
+ if module.position_embeddings is not None:
205
+ module.position_embeddings.data.zero_()
206
+ elif isinstance(module, InternS1VisionLayer):
207
+ module.lambda_1.data.fill_(self.config.layer_scale_init_value)
208
+ module.lambda_2.data.fill_(self.config.layer_scale_init_value)
209
+
210
+
211
+ @dataclass
212
+ @auto_docstring(
213
+ custom_intro="""
214
+ Class for outputs of [`InternS1VisionModel`].
215
+ """
216
+ )
217
+ class InternS1VisionModelOutputWithPooling(BaseModelOutputWithPooling):
218
+ r"""
219
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
220
+ Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
221
+ *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
222
+ will be returned.
223
+ """
224
+
225
+
226
+ class InternS1VisionPatchEmbeddings(nn.Module):
227
+ """
228
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
229
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
230
+ Transformer.
231
+ """
232
+
233
+ def __init__(self, config):
234
+ super().__init__()
235
+ image_size, patch_size = config.image_size, config.patch_size
236
+ num_channels, hidden_size = config.num_channels, config.hidden_size
237
+
238
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
239
+ patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
240
+ self.image_size = image_size
241
+ self.patch_size = patch_size
242
+ self.num_channels = num_channels
243
+ self.num_patches = num_patches
244
+ self.patch_shape = patch_shape
245
+
246
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
247
+
248
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
249
+ batch_size, num_channels, height, width = pixel_values.shape
250
+ if num_channels != self.num_channels:
251
+ raise ValueError(
252
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
253
+ )
254
+
255
+ embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
256
+ patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
257
+ embeddings = embeddings.flatten(2).transpose(1, 2)
258
+
259
+ return embeddings, (patch_height, patch_width)
260
+
261
+
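A hedged shape example for the patch embeddings above; the config values (448 px images, 14 px patches, hidden size 1024) are illustrative and may differ from the shipped configuration:

```python
import torch

class _DummyConfig:
    image_size = (448, 448)
    patch_size = (14, 14)
    num_channels = 3
    hidden_size = 1024

patch_embed = InternS1VisionPatchEmbeddings(_DummyConfig())
pixel_values = torch.randn(2, 3, 448, 448)
tokens, (patch_h, patch_w) = patch_embed(pixel_values)
# 448 / 14 = 32 patches per side -> 32 * 32 = 1024 tokens per image
print(tokens.shape, patch_h, patch_w)  # torch.Size([2, 1024, 1024]) 32 32
```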
262
+ # Based on timm implementation, which can be found here:
263
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
264
+ class InternS1VisionEmbeddings(nn.Module):
265
+ """
266
+ Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
267
+
268
+ """
269
+
270
+ def __init__(self, config: InternS1VisionConfig) -> None:
271
+ super().__init__()
272
+
273
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
274
+ if config.use_mask_token:
275
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
276
+ else:
277
+ self.mask_token = None
278
+ self.patch_embeddings = InternS1VisionPatchEmbeddings(config)
279
+ self.patch_size = config.patch_size
280
+ self.image_size = (
281
+ config.image_size
282
+ if isinstance(config.image_size, collections.abc.Iterable)
283
+ else (config.image_size, config.image_size)
284
+ )
285
+ num_patches = self.patch_embeddings.num_patches
286
+ if config.use_absolute_position_embeddings:
287
+ self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
288
+ else:
289
+ self.position_embeddings = None
290
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
291
+
292
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
293
+ """
294
+ This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
295
+ images. It is also adapted to support torch.jit tracing.
296
+
297
+ Adapted from:
298
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
299
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
300
+ """
301
+
302
+ num_patches = embeddings.shape[1] - 1
303
+ num_positions = self.position_embeddings.shape[1] - 1
304
+
305
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
306
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
307
+ return self.position_embeddings
308
+
309
+ class_pos_embed = self.position_embeddings[:, :1]
310
+ patch_pos_embed = self.position_embeddings[:, 1:]
311
+
312
+ dim = embeddings.shape[-1]
313
+
314
+ new_height = height // self.patch_size[0]
315
+ new_width = width // self.patch_size[1]
316
+
317
+ sqrt_num_positions = torch_int(num_positions ** 0.5)
318
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
319
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
320
+
321
+ patch_pos_embed = nn.functional.interpolate(
322
+ patch_pos_embed,
323
+ size=(new_height, new_width),
324
+ mode="bicubic",
325
+ align_corners=False,
326
+ )
327
+
328
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
329
+
330
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
331
+
332
+ def forward(
333
+ self,
334
+ pixel_values: torch.Tensor,
335
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
336
+ ) -> torch.Tensor:
337
+ _, _, height, width = pixel_values.shape
338
+ embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
339
+ batch_size, seq_len, _ = embeddings.size()
340
+
341
+ if bool_masked_pos is not None:
342
+ mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
343
+ # replace the masked visual tokens by mask_tokens
344
+ w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
345
+ embeddings = embeddings * (1 - w) + mask_tokens * w
346
+
347
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
348
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
349
+
350
+ if self.position_embeddings is not None:
351
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
352
+
353
+ embeddings = self.dropout(embeddings)
354
+
355
+ return embeddings, (patch_height, patch_width)
356
+
357
+
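A hedged sketch of the position-embedding interpolation performed by `interpolate_pos_encoding` above, isolated from the module; the 32 to 64 patch-grid sizes are illustrative:

```python
import torch
from torch import nn

dim, old_side, new_side = 64, 32, 64
pos = torch.randn(1, old_side * old_side + 1, dim)  # [CLS] position + patch positions
cls_pos, patch_pos = pos[:, :1], pos[:, 1:]

patch_pos = patch_pos.reshape(1, old_side, old_side, dim).permute(0, 3, 1, 2)
patch_pos = nn.functional.interpolate(patch_pos, size=(new_side, new_side), mode="bicubic", align_corners=False)
patch_pos = patch_pos.permute(0, 2, 3, 1).reshape(1, -1, dim)

resized = torch.cat((cls_pos, patch_pos), dim=1)
print(resized.shape)  # torch.Size([1, 4097, 64]) -> usable for a 2x larger input resolution
```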
358
+ class InternS1VisionMLP(nn.Module):
359
+ def __init__(self, config):
360
+ super().__init__()
361
+ self.config = config
362
+ self.activation_fn = ACT2FN[config.hidden_act]
363
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
364
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
365
+
366
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
367
+ hidden_states = self.fc1(hidden_states)
368
+ hidden_states = self.activation_fn(hidden_states)
369
+ hidden_states = self.fc2(hidden_states)
370
+ return hidden_states
371
+
372
+
373
+ NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternS1VisionRMSNorm}
374
+
375
+
376
+ class InternS1VisionLayer(GradientCheckpointingLayer):
377
+ """This corresponds to the Block class in the timm implementation."""
378
+
379
+ def __init__(self, config: InternS1VisionConfig, drop_path_rate=0.0) -> None:
380
+ super().__init__()
381
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
382
+ self.seq_len_dim = 1
383
+ self.attention = InternS1VisionAttention(config)
384
+ self.mlp = InternS1VisionMLP(config)
385
+ # InternS1 uses different layernorm implementations for different models
386
+ self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
387
+ self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
388
+
389
+ init_values = config.layer_scale_init_value
390
+ self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
391
+ self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
392
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
393
+
394
+ if drop_path_rate > 0.0:
395
+ try:
396
+ from timm.layers import DropPath
397
+ except ImportError:
398
+ raise ImportError("timm is not installed, please install it to use DropPath by 'pip install timm'. ")
399
+ self.drop_path1 = DropPath(drop_path_rate)
400
+ self.drop_path2 = DropPath(drop_path_rate)
401
+ else:
402
+ self.drop_path1 = nn.Identity()
403
+ self.drop_path2 = nn.Identity()
404
+
405
+ def forward(
406
+ self,
407
+ hidden_states: torch.Tensor,
408
+ output_attentions: bool = False,
409
+ ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
410
+ attention_output, attention_weights = self.attention(
411
+ self.layernorm_before(hidden_states), # in InternS1Vision, layernorm is applied before self-attention
412
+ output_attentions=output_attentions,
413
+ )
414
+
415
+ attention_output = self.lambda_1 * attention_output
416
+
417
+ # first residual connection
418
+ hidden_states = self.drop_path1(attention_output) + hidden_states
419
+
420
+ # in InternS1Vision, layernorm is also applied after self-attention
421
+ layer_output = self.layernorm_after(hidden_states)
422
+
423
+ layer_output = self.mlp(layer_output)
424
+ layer_output = self.dropout(layer_output)
425
+
426
+ if self.lambda_2 is not None:
427
+ layer_output = self.lambda_2 * layer_output
428
+
429
+ # second residual connection
430
+ layer_output = self.drop_path2(layer_output) + hidden_states
431
+
432
+ return layer_output, attention_weights
433
+
434
+
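A hedged schematic of one vision block above: two pre-norm residual branches, each scaled per channel by a layer-scale parameter (drop path and dropout omitted for clarity; the stand-in modules are hypothetical):

```python
import torch
from torch import nn

dim = 16
ln_before, ln_after = nn.LayerNorm(dim), nn.LayerNorm(dim)
attn = nn.Identity()   # stand-in for InternS1VisionAttention
mlp = nn.Identity()    # stand-in for InternS1VisionMLP
lambda_1 = lambda_2 = torch.full((dim,), 0.1)  # config.layer_scale_init_value

x = torch.randn(2, 5, dim)
x = x + lambda_1 * attn(ln_before(x))  # first residual branch
x = x + lambda_2 * mlp(ln_after(x))    # second residual branch
print(x.shape)  # torch.Size([2, 5, 16])
```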
435
+ class InternS1VisionEncoder(nn.Module):
436
+ def __init__(self, config: InternS1VisionConfig) -> None:
437
+ super().__init__()
438
+ self.config = config
439
+ dpr = np.linspace(0.0, float(config.drop_path_rate), int(config.num_hidden_layers))
440
+ self.layer = nn.ModuleList([InternS1VisionLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
441
+
442
+ @can_return_tuple
443
+ def forward(
444
+ self,
445
+ hidden_states: torch.Tensor,
446
+ output_attentions: bool = False,
447
+ output_hidden_states: bool = False,
448
+ ) -> Union[tuple, BaseModelOutput]:
449
+ all_hidden_states = () if output_hidden_states else None
450
+ all_self_attentions = () if output_attentions else None
451
+
452
+ for i, layer_module in enumerate(self.layer):
453
+ if output_hidden_states:
454
+ all_hidden_states = all_hidden_states + (hidden_states,)
455
+
456
+ layer_outputs = layer_module(hidden_states, output_attentions)
457
+
458
+ hidden_states = layer_outputs[0]
459
+
460
+ if output_attentions:
461
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
462
+
463
+ if output_hidden_states:
464
+ all_hidden_states = all_hidden_states + (hidden_states,)
465
+
466
+ return BaseModelOutput(
467
+ last_hidden_state=hidden_states,
468
+ hidden_states=all_hidden_states,
469
+ attentions=all_self_attentions,
470
+ )
471
+
472
+
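The encoder above builds a linearly increasing stochastic-depth schedule; a hedged illustration with made-up values:

```python
import numpy as np

num_hidden_layers, drop_path_rate = 6, 0.1  # illustrative config values
print(np.linspace(0.0, drop_path_rate, num_hidden_layers))
# per-layer drop-path rates: 0.0, 0.02, 0.04, 0.06, 0.08, 0.1
```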
473
+ @auto_docstring
474
+ class InternS1VisionModel(InternS1VisionPreTrainedModel):
475
+ def __init__(self, config: InternS1VisionConfig) -> None:
476
+ super().__init__(config)
477
+ self.config = config
478
+
479
+ self.embeddings = InternS1VisionEmbeddings(config)
480
+ self.encoder = InternS1VisionEncoder(config)
481
+
482
+ self.layernorm = (
483
+ nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
484
+ )
485
+
486
+ # Initialize weights and apply final processing
487
+ self.post_init()
488
+
489
+ def get_input_embeddings(self):
490
+ return self.embeddings.patch_embeddings
491
+
492
+ @can_return_tuple
493
+ @auto_docstring
494
+ def forward(
495
+ self,
496
+ pixel_values: torch.Tensor,
497
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
498
+ output_attentions: Optional[bool] = None,
499
+ output_hidden_states: Optional[bool] = None,
500
+ ) -> Union[tuple, InternS1VisionModelOutputWithPooling]:
501
+ r"""
502
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
503
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
504
+ """
505
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
506
+ output_hidden_states = (
507
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
508
+ )
509
+
510
+ embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
511
+
512
+ encoder_outputs = self.encoder(
513
+ embedding_output,
514
+ output_attentions=output_attentions,
515
+ output_hidden_states=output_hidden_states,
516
+ )
517
+ sequence_output = encoder_outputs[0]
518
+ sequence_output = self.layernorm(sequence_output)
519
+
520
+ return InternS1VisionModelOutputWithPooling(
521
+ last_hidden_state=sequence_output,
522
+ hidden_states=encoder_outputs.hidden_states,
523
+ attentions=encoder_outputs.attentions,
524
+ )
525
+
526
+
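A hedged usage sketch for running the vision encoder on its own; `model` is assumed to be an `InternS1ForConditionalGeneration` loaded as in the generation example further below, so `model.vision_tower` is an `InternS1VisionModel`:

```python
import torch

vision_tower = model.vision_tower
pixel_values = torch.randn(1, 3, 448, 448, dtype=vision_tower.dtype, device=vision_tower.device)
with torch.no_grad():
    out = vision_tower(pixel_values=pixel_values, output_hidden_states=True)
print(out.last_hidden_state.shape)  # (1, 1 + num_patches, vision hidden_size)
print(len(out.hidden_states))       # num_hidden_layers + 1 (embedding output included)
```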
527
+ @auto_docstring
528
+ class InternS1PreTrainedModel(PreTrainedModel):
529
+ config_class = InternS1Config
530
+ base_model_prefix = ""
531
+ supports_gradient_checkpointing = True
532
+ _skip_keys_device_placement = "past_key_values"
533
+
534
+ _supports_flash_attn = True
535
+ _supports_sdpa = True
536
+
537
+ _supports_static_cache = True
538
+ _supports_flex_attn = True
539
+ _supports_attention_backend = True
540
+
541
+ def _init_weights(self, module):
542
+ std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)
543
+
544
+ if isinstance(module, nn.Linear):
545
+ module.weight.data.normal_(mean=0.0, std=std)
546
+ if module.bias is not None:
547
+ module.bias.data.zero_()
548
+ elif isinstance(module, nn.LayerNorm):
549
+ module.bias.data.zero_()
550
+ module.weight.data.fill_(1.0)
551
+
552
+
553
+ class InternS1MultiModalProjector(nn.Module):
554
+ def __init__(self, config: InternS1Config):
555
+ super().__init__()
556
+ self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
557
+ self.linear_1 = nn.Linear(
558
+ config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size
559
+ )
560
+ self.act = ACT2FN[config.projector_hidden_act]
561
+ self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)
562
+
563
+ def forward(self, image_features):
564
+ hidden_states = self.layer_norm(image_features)
565
+ hidden_states = self.linear_1(hidden_states)
566
+ hidden_states = self.act(hidden_states)
567
+ hidden_states = self.linear_2(hidden_states)
568
+ return hidden_states
569
+
570
+
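A hedged shape walk-through for the projector above: with `downsample_ratio = 0.5`, pixel shuffle packs four spatial neighbours into the channel dimension, so the projector input is four times the vision hidden size (numbers below are illustrative):

```python
vision_hidden, text_hidden, downsample_ratio = 1024, 4096, 0.5
projector_in = vision_hidden * int(1 / downsample_ratio) ** 2
print(projector_in)  # 4096: LayerNorm(4096) -> Linear(4096 -> text_hidden) -> act -> Linear(text_hidden -> text_hidden)
```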
571
+ @dataclass
572
+ @auto_docstring(
573
+ custom_intro="""
574
+ Base class for InternS1 outputs, with hidden states and attentions.
575
+ """
576
+ )
577
+ class InternS1ModelOutputWithPast(ModelOutput):
578
+ """
579
+ Base class for model's outputs, with potential hidden states and attentions.
580
+
581
+ Args:
582
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
583
+ Sequence of hidden-states at the output of the last layer of the model.
584
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
585
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
586
+
587
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
588
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
589
+ input) to speed up sequential decoding.
590
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
591
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
592
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
593
+
594
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
595
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
596
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
597
+ sequence_length)`.
598
+
599
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
600
+ heads.
601
+ router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
602
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
603
+
604
+ Raw router logits (post-softmax) computed by the MoE routers; these terms are used to compute the auxiliary
605
+ loss for Mixture of Experts models.
606
+ image_hidden_states (`torch.FloatTensor`, *optional*):
607
+ A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
608
+ image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
609
+ """
610
+
611
+ last_hidden_state: Optional[torch.FloatTensor] = None
612
+ past_key_values: Optional[Cache] = None
613
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
614
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
615
+ router_logits: Optional[tuple[torch.FloatTensor]] = None
616
+ image_hidden_states: Optional[torch.FloatTensor] = None
617
+
618
+
619
+ @auto_docstring(
620
+ custom_intro="""
621
+ The InternS1 model which consists of a vision backbone and a language model, without a language modeling head.
622
+ """
623
+ )
624
+ class InternS1Model(InternS1PreTrainedModel):
625
+ config_class = InternS1Config
626
+
627
+ def __init__(self, config: InternS1Config):
628
+ super().__init__(config)
629
+ self.vision_tower = InternS1VisionModel._from_config(config.vision_config)
630
+
631
+ self.multi_modal_projector = InternS1MultiModalProjector(config)
632
+ self.language_model = AutoModel.from_config(config.text_config)
633
+
634
+ self.is_moe_model = False
635
+ if hasattr(config.text_config, 'output_router_logits'):
636
+ self.is_moe_model = True
637
+
638
+ self.post_init()
639
+
640
+ def get_input_embeddings(self):
641
+ return self.language_model.get_input_embeddings()
642
+
643
+ def set_input_embeddings(self, value):
644
+ self.language_model.set_input_embeddings(value)
645
+
646
+ def set_decoder(self, decoder):
647
+ self.language_model = decoder
648
+
649
+ def get_decoder(self):
650
+ return self.language_model
651
+
652
+ def get_image_features(
653
+ self,
654
+ pixel_values: torch.FloatTensor,
655
+ vision_feature_layer: Optional[Union[int, list[int]]] = None,
656
+ vision_feature_select_strategy: Optional[str] = None,
657
+ **kwargs,
658
+ ):
659
+ """
660
+ Obtains the last hidden states of the images from the vision tower and applies the multimodal projection.
661
+
662
+ Args:
663
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
664
+ The tensors corresponding to the input images.
665
+ vision_feature_layer (`int` or `list[int]`):
666
+ Layer index or list of layer indices to extract features from.
667
+ Returns:
668
+ vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
669
+ """
670
+ vision_feature_layer = (
671
+ vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
672
+ )
673
+ vision_feature_select_strategy = (
674
+ vision_feature_select_strategy
675
+ if vision_feature_select_strategy is not None
676
+ else self.config.vision_feature_select_strategy
677
+ )
678
+
679
+ downsample_ratio = self.config.downsample_ratio
680
+ if vision_feature_layer == -1:
681
+ vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
682
+ else:
683
+ vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
684
+ if vision_feature_select_strategy == "default":
685
+ vision_features = vision_features[:, 1:, :]
686
+
687
+ # Calculate dimensions based on vision features
688
+ channels = vision_features.shape[1]
689
+ feature_size = int(channels ** 0.5)
690
+ batch_size = vision_features.shape[0]
691
+
692
+ # Reshape tensor to spatial dimensions
693
+ vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)
694
+
695
+ # Apply downsampling using pixel shuffle
696
+ vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)
697
+
698
+ # Reshape tensor to prepare for projection
699
+ vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])
700
+
701
+ # Project features through multi-modal projector
702
+ vision_features = self.multi_modal_projector(vision_features)
703
+ return vision_features
704
+
705
+ @can_return_tuple
706
+ @auto_docstring
707
+ def forward(
708
+ self,
709
+ input_ids: torch.LongTensor = None,
710
+ pixel_values: torch.FloatTensor = None,
711
+ attention_mask: Optional[torch.Tensor] = None,
712
+ position_ids: Optional[torch.LongTensor] = None,
713
+ past_key_values: Optional[Cache] = None,
714
+ inputs_embeds: Optional[torch.FloatTensor] = None,
715
+ vision_feature_layer: Optional[Union[int, list[int]]] = None,
716
+ vision_feature_select_strategy: Optional[str] = None,
717
+ use_cache: Optional[bool] = None,
718
+ output_attentions: Optional[bool] = None,
719
+ output_hidden_states: Optional[bool] = None,
720
+ output_router_logits: Optional[bool] = None,
721
+ return_dict: Optional[bool] = None,
722
+ cache_position: Optional[torch.LongTensor] = None,
723
+ **kwargs: Unpack[FlashAttentionKwargs],
724
+ ) -> InternS1ModelOutputWithPast:
725
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
726
+ output_hidden_states = (
727
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
728
+ )
729
+ if self.is_moe_model:
730
+ output_router_logits = (
731
+ output_router_logits if output_router_logits is not None else self.config.text_config.output_router_logits
732
+ )
733
+ kwargs['output_router_logits'] = output_router_logits
734
+
735
+ vision_feature_layer = (
736
+ vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
737
+ )
738
+ vision_feature_select_strategy = (
739
+ vision_feature_select_strategy
740
+ if vision_feature_select_strategy is not None
741
+ else self.config.vision_feature_select_strategy
742
+ )
743
+
744
+ if (input_ids is None) ^ (inputs_embeds is not None):
745
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
746
+
747
+ if inputs_embeds is None:
748
+ inputs_embeds = self.get_input_embeddings()(input_ids)
749
+
750
+ if pixel_values is not None:
751
+ image_features = self.get_image_features(
752
+ pixel_values=pixel_values,
753
+ vision_feature_layer=vision_feature_layer,
754
+ vision_feature_select_strategy=vision_feature_select_strategy,
755
+ )
756
+
757
+ if input_ids is None:
758
+ special_image_mask = inputs_embeds == self.get_input_embeddings()(
759
+ torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
760
+ )
761
+ special_image_mask = special_image_mask.all(-1)
762
+ else:
763
+ special_image_mask = input_ids == self.config.image_token_id
764
+
765
+ n_image_tokens = (special_image_mask).sum()
766
+ special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
767
+
768
+ if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
769
+ n_image_features = image_features.shape[0] * image_features.shape[1]
770
+ raise ValueError(
771
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
772
+ )
773
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
774
+ inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
775
+
776
+ outputs = self.language_model(
777
+ attention_mask=attention_mask,
778
+ position_ids=position_ids,
779
+ past_key_values=past_key_values,
780
+ inputs_embeds=inputs_embeds,
781
+ use_cache=use_cache,
782
+ output_attentions=output_attentions,
783
+ output_hidden_states=output_hidden_states,
784
+ cache_position=cache_position,
785
+ **kwargs,
786
+ )
787
+
788
+ return InternS1ModelOutputWithPast(
789
+ last_hidden_state=outputs.last_hidden_state,
790
+ past_key_values=outputs.past_key_values,
791
+ hidden_states=outputs.hidden_states,
792
+ attentions=outputs.attentions,
793
+ router_logits=outputs.router_logits if self.is_moe_model else None,
794
+ image_hidden_states=image_features if pixel_values is not None else None,
795
+ )
796
+
797
+ def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
798
+ """Perform pixel shuffle downsampling on vision features.
799
+
800
+ Args:
801
+ vision_features (`torch.Tensor`):
802
+ Input tensor of shape (batch_size, width, height, channels).
803
+ scale_factor (`float`, *optional*, defaults to `0.5`):
804
+ Factor by which to downsample. Default is 0.5, which halves the dimensions.
805
+
806
+ Returns:
807
+ vision_features (`torch.Tensor`):
808
+ Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
809
+ """
810
+ batch_size, width, height, channels = vision_features.size()
811
+
812
+ if height % scale_factor != 0 or width % scale_factor != 0:
813
+ raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")
814
+
815
+ # Reshape to allow downsampling
816
+ vision_features = vision_features.view(
817
+ batch_size, width, int(height * scale_factor), int(channels / scale_factor)
818
+ )
819
+ # Permute dimensions to align downsampled axis correctly
820
+ vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
821
+
822
+ # Reshape to achieve final downsampled dimensions
823
+ vision_features = vision_features.view(
824
+ batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor ** 2))
825
+ )
826
+
827
+ # Swap height and width back for proper orientation
828
+ vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
829
+
830
+ return vision_features
831
+
832
+
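A hedged shape check for `pixel_shuffle` above; since the method never touches `self`, it is called unbound here purely for illustration:

```python
import torch

features = torch.randn(2, 32, 32, 1024)  # (batch, width, height, channels)
shuffled = InternS1Model.pixel_shuffle(None, features, scale_factor=0.5)
print(shuffled.shape)  # torch.Size([2, 16, 16, 4096]) -- 4x fewer tokens, 4x wider channels
```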
833
+ @dataclass
834
+ @auto_docstring(
835
+ custom_intro="""
836
+ Base class for InternS1 causal language model (or autoregressive) outputs.
837
+ """
838
+ )
839
+ class InternS1CausalLMOutputWithPast(ModelOutput):
840
+ """
841
+ Base class for causal language model (or autoregressive) with mixture of experts outputs.
842
+
843
+ Args:
844
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
845
+ Language modeling loss (for next-token prediction).
846
+
847
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
848
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
849
+
850
+ aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
851
+ Auxiliary loss for the sparse modules.
852
+
853
+ router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
854
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
855
+
856
+ Raw router logits (post-softmax) computed by the MoE routers; these terms are used to compute the auxiliary
857
+ loss for Mixture of Experts models.
858
+
859
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
860
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
861
+
862
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
863
+ `past_key_values` input) to speed up sequential decoding.
864
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
865
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
866
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
867
+
868
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
869
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
870
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
871
+ sequence_length)`.
872
+
873
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
874
+ heads.
875
+ image_hidden_states (`torch.FloatTensor`, *optional*):
876
+ A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
877
+ image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
878
+ """
879
+
880
+ loss: Optional[torch.FloatTensor] = None
881
+ aux_loss: Optional[torch.FloatTensor] = None
882
+ logits: Optional[torch.FloatTensor] = None
883
+ past_key_values: Optional[Cache] = None
884
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
885
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
886
+ router_logits: Optional[tuple[torch.FloatTensor]] = None
887
+ image_hidden_states: Optional[torch.FloatTensor] = None
888
+
889
+
890
+ def load_balancing_loss_func(
891
+ gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
892
+ num_experts: Optional[int] = None,
893
+ top_k=2,
894
+ attention_mask: Optional[torch.Tensor] = None,
895
+ ) -> Union[torch.Tensor, int]:
896
+ r"""
897
+ Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
898
+
899
+ See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
900
+ function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
901
+ experts is too unbalanced.
902
+
903
+ Args:
904
+ gate_logits:
905
+ Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
906
+ shape [batch_size X sequence_length, num_experts].
907
+ num_experts:
908
+ Number of experts
909
+ top_k:
910
+ The number of experts to route per token; can also be interpreted as the `top-k` routing
911
+ parameter.
912
+ attention_mask (`torch.Tensor`, *optional*):
913
+ The attention_mask used in forward function
914
+ shape [batch_size X sequence_length] if not None.
915
+
916
+ Returns:
917
+ The auxiliary loss.
918
+ """
919
+ if gate_logits is None or not isinstance(gate_logits, tuple):
920
+ return 0
921
+
922
+ if isinstance(gate_logits, tuple):
923
+ compute_device = gate_logits[0].device
924
+ concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
925
+
926
+ routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
927
+
928
+ _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
929
+
930
+ expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
931
+
932
+ if attention_mask is None:
933
+ # Compute the percentage of tokens routed to each experts
934
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
935
+
936
+ # Compute the average probability of routing to these experts
937
+ router_prob_per_expert = torch.mean(routing_weights, dim=0)
938
+ else:
939
+ batch_size, sequence_length = attention_mask.shape
940
+ num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
941
+
942
+ # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
943
+ expert_attention_mask = (
944
+ attention_mask[None, :, :, None, None]
945
+ .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
946
+ .reshape(-1, top_k, num_experts)
947
+ .to(compute_device)
948
+ )
949
+
950
+ # Compute the percentage of tokens routed to each experts
951
+ tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
952
+ expert_attention_mask, dim=0
953
+ )
954
+
955
+ # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
956
+ router_per_expert_attention_mask = (
957
+ attention_mask[None, :, :, None]
958
+ .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
959
+ .reshape(-1, num_experts)
960
+ .to(compute_device)
961
+ )
962
+
963
+ # Compute the average probability of routing to these experts
964
+ router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
965
+ router_per_expert_attention_mask, dim=0
966
+ )
967
+
968
+ overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
969
+ return overall_loss * num_experts
970
+
971
+
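A hedged usage sketch for `load_balancing_loss_func`: the gate logits are a tuple with one `(batch * seq_len, num_experts)` tensor per layer (random values here, just to show the call):

```python
import torch

num_layers, batch, seq_len, num_experts, top_k = 2, 2, 8, 4, 2
gate_logits = tuple(torch.randn(batch * seq_len, num_experts) for _ in range(num_layers))
aux = load_balancing_loss_func(gate_logits, num_experts=num_experts, top_k=top_k)
print(aux)  # scalar tensor added (scaled by router_aux_loss_coef) to the LM loss
```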
972
+ @auto_docstring(
973
+ custom_intro="""
974
+ The InternS1 model, which consists of a vision backbone and a language model.
975
+ """
976
+ )
977
+ class InternS1ForConditionalGeneration(InternS1PreTrainedModel, GenerationMixin):
978
+ config_class = InternS1Config
979
+ _tied_weights_keys = ["lm_head.weight"]
980
+
981
+ def __init__(self, config: InternS1Config):
982
+ super().__init__(config)
983
+ self.model = InternS1Model(config)
984
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
985
+
986
+ self.is_moe_model = False
987
+ if hasattr(config.text_config, 'output_router_logits'):
988
+ self.is_moe_model = True
989
+ self.post_init()
990
+
991
+ def get_input_embeddings(self):
992
+ return self.model.get_input_embeddings()
993
+
994
+ def set_input_embeddings(self, value):
995
+ self.model.set_input_embeddings(value)
996
+
997
+ def get_output_embeddings(self) -> nn.Module:
998
+ return self.lm_head
999
+
1000
+ def set_output_embeddings(self, new_embeddings):
1001
+ self.lm_head = new_embeddings
1002
+
1003
+ def set_decoder(self, decoder):
1004
+ self.model.set_decoder(decoder)
1005
+
1006
+ def get_decoder(self):
1007
+ return self.model.get_decoder()
1008
+
1009
+ def get_image_features(
1010
+ self,
1011
+ pixel_values: torch.FloatTensor,
1012
+ vision_feature_layer: Optional[Union[int, list[int]]] = None,
1013
+ vision_feature_select_strategy: Optional[str] = None,
1014
+ **kwargs,
1015
+ ):
1016
+ return self.model.get_image_features(
1017
+ pixel_values=pixel_values,
1018
+ vision_feature_layer=vision_feature_layer,
1019
+ vision_feature_select_strategy=vision_feature_select_strategy,
1020
+ **kwargs,
1021
+ )
1022
+
1023
+ # Make modules available through the conditional class for BC
1024
+ @property
1025
+ def language_model(self):
1026
+ return self.model.language_model
1027
+
1028
+ @property
1029
+ def vision_tower(self):
1030
+ return self.model.vision_tower
1031
+
1032
+ @property
1033
+ def multi_modal_projector(self):
1034
+ return self.model.multi_modal_projector
1035
+
1036
+ @can_return_tuple
1037
+ @auto_docstring
1038
+ def forward(
1039
+ self,
1040
+ input_ids: torch.LongTensor = None,
1041
+ pixel_values: torch.FloatTensor = None,
1042
+ attention_mask: Optional[torch.Tensor] = None,
1043
+ position_ids: Optional[torch.LongTensor] = None,
1044
+ past_key_values: Optional[Cache] = None,
1045
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1046
+ vision_feature_layer: Optional[Union[int, list[int]]] = None,
1047
+ vision_feature_select_strategy: Optional[str] = None,
1048
+ labels: Optional[torch.LongTensor] = None,
1049
+ use_cache: Optional[bool] = None,
1050
+ output_attentions: Optional[bool] = None,
1051
+ output_hidden_states: Optional[bool] = None,
1052
+ output_router_logits: Optional[bool] = None,
1053
+ return_dict: Optional[bool] = None,
1054
+ cache_position: Optional[torch.LongTensor] = None,
1055
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1056
+ image_sizes: Optional[torch.Tensor] = None,
1057
+ **kwargs,
1058
+ ) -> Union[tuple, InternS1CausalLMOutputWithPast]:
1059
+ r"""
1060
+ Example:
1061
+
1062
+ ```python
1063
+ >>> import torch
1064
+ >>> from transformers import AutoProcessor, AutoModelForImageTextToText
1065
+
1066
+ >>> torch_device = "cuda"
1067
+ >>> processor = AutoProcessor.from_pretrained("InternLM/InternS1") # todo
1068
+ >>> model = AutoModelForImageTextToText.from_pretrained(
1069
+ ... "InternLM/InternS1", torch_dtype=torch.bfloat16, device_map=torch_device
1070
+ ... )
1071
+
1072
+ >>> messages = [
1073
+ ... {
1074
+ ... "role": "user",
1075
+ ... "content": [
1076
+ ... {
1077
+ ... "type": "image",
1078
+ ... "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
1079
+ ... },
1080
+ ... {
1081
+ ... "type": "image",
1082
+ ... "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
1083
+ ... },
1084
+ ... {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
1085
+ ... ],
1086
+ ... },
1087
+ ... ]
1088
+
1089
+ >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
1090
+ >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
1091
+ >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
1092
+ The images depict the Statue of Liberty and the Golden Gate Bridge.
1093
+ ```"""
1094
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1095
+ output_hidden_states = (
1096
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1097
+ )
1098
+
1099
+ if self.is_moe_model:
1100
+ output_router_logits = (
1101
+ output_router_logits if output_router_logits is not None else self.config.text_config.output_router_logits
1102
+ )
1103
+ kwargs['output_router_logits'] = output_router_logits
1104
+
1105
+ vision_feature_layer = (
1106
+ vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
1107
+ )
1108
+ vision_feature_select_strategy = (
1109
+ vision_feature_select_strategy
1110
+ if vision_feature_select_strategy is not None
1111
+ else self.config.vision_feature_select_strategy
1112
+ )
1113
+
1114
+ outputs = self.model(
1115
+ input_ids=input_ids,
1116
+ pixel_values=pixel_values,
1117
+ attention_mask=attention_mask,
1118
+ position_ids=position_ids,
1119
+ past_key_values=past_key_values,
1120
+ inputs_embeds=inputs_embeds,
1121
+ vision_feature_layer=vision_feature_layer,
1122
+ vision_feature_select_strategy=vision_feature_select_strategy,
1123
+ use_cache=use_cache,
1124
+ output_attentions=output_attentions,
1125
+ output_hidden_states=output_hidden_states,
1126
+ cache_position=cache_position,
1127
+ image_sizes=image_sizes,
1128
+ **kwargs,
1129
+ )
1130
+
1131
+ hidden_states = outputs.last_hidden_state
1132
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1133
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1134
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
1135
+
1136
+ loss = None
1137
+ if labels is not None:
1138
+ loss = self.loss_function(
1139
+ logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
1140
+ )
1141
+
1142
+ aux_loss = None
1143
+ if self.is_moe_model and output_router_logits and labels is not None:
1144
+ aux_loss = load_balancing_loss_func(
1145
+ outputs.router_logits,
1146
+ self.config.text_config.num_experts,
1147
+ self.config.text_config.num_experts_per_tok,
1148
+ attention_mask,
1149
+ )
1150
+ loss += self.config.text_config.router_aux_loss_coef * aux_loss.to(loss.device)
1151
+
1152
+ return InternS1CausalLMOutputWithPast(
1153
+ loss=loss,
1154
+ aux_loss=aux_loss,
1155
+ logits=logits,
1156
+ past_key_values=outputs.past_key_values,
1157
+ hidden_states=outputs.hidden_states,
1158
+ attentions=outputs.attentions,
1159
+ router_logits=outputs.router_logits if self.is_moe_model else None,
1160
+ image_hidden_states=outputs.image_hidden_states,
1161
+ )
1162
+
1163
+ def prepare_inputs_for_generation(
1164
+ self,
1165
+ input_ids,
1166
+ past_key_values=None,
1167
+ inputs_embeds=None,
1168
+ pixel_values=None,
1169
+ attention_mask=None,
1170
+ cache_position=None,
1171
+ logits_to_keep=None,
1172
+ **kwargs,
1173
+ ):
1174
+ # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
1175
+
1176
+ model_inputs = super().prepare_inputs_for_generation(
1177
+ input_ids,
1178
+ past_key_values=past_key_values,
1179
+ inputs_embeds=inputs_embeds,
1180
+ attention_mask=attention_mask,
1181
+ cache_position=cache_position,
1182
+ logits_to_keep=logits_to_keep,
1183
+ **kwargs,
1184
+ )
1185
+
1186
+ if cache_position[0] == 0:
1187
+ # Pixel values are only forwarded on the prefill step (cache_position[0] == 0); during cached decoding the
1188
+ # input ids no longer contain the special image tokens, so pixel_values is left as None
1189
+ model_inputs["pixel_values"] = pixel_values
1190
+
1191
+ return model_inputs
1192
+
1193
+
1194
+ __all__ = [
1195
+ "InternS1VisionPreTrainedModel",
1196
+ "InternS1VisionModel",
1197
+ "InternS1PreTrainedModel",
1198
+ "InternS1Model",
1199
+ "InternS1ForConditionalGeneration",
1200
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "crop_size": null,
3
+ "crop_to_patches": false,
4
+ "data_format": "channels_first",
5
+ "default_to_square": true,
6
+ "device": null,
7
+ "disable_grouping": null,
8
+ "do_center_crop": null,
9
+ "do_convert_rgb": true,
10
+ "do_normalize": true,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_mean": [
14
+ 0.485,
15
+ 0.456,
16
+ 0.406
17
+ ],
18
+ "image_processor_type": "GotOcr2ImageProcessorFast",
19
+ "image_std": [
20
+ 0.229,
21
+ 0.224,
22
+ 0.225
23
+ ],
24
+ "input_data_format": null,
25
+ "max_patches": 12,
26
+ "min_patches": 1,
27
+ "processor_class": "InternS1Processor",
28
+ "resample": 3,
29
+ "rescale_factor": 0.00392156862745098,
30
+ "return_tensors": null,
31
+ "size": {
32
+ "height": 448,
33
+ "width": 448
34
+ }
35
+ }
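A hedged sketch of loading the image processor described by this config; the repo id mirrors the one used in the generation docstring above:

```python
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("InternLM/InternS1")
print(image_processor.size, image_processor.max_patches)  # 448x448 tiles, up to 12 patches per image
```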
processing_interns1.py ADDED
@@ -0,0 +1,317 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Optional, Union
17
+
18
+ import numpy as np
19
+
20
+ from transformers.image_processing_utils import BatchFeature
21
+ from transformers.image_utils import ImageInput, concatenate_list, make_flat_list_of_images
22
+ from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
23
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
24
+ from transformers.video_utils import VideoInput, make_batched_videos
25
+
26
+
27
+ class InternS1ImagesKwargs(ImagesKwargs, total=False):
28
+ crop_to_patches: Optional[bool]
29
+ min_patches: Optional[int]
30
+ max_patches: Optional[int]
31
+
32
+
33
+ class InternS1ProcessorKwargs(ProcessingKwargs, total=False):
34
+ images_kwargs: InternS1ImagesKwargs
35
+ _defaults = {
36
+ "text_kwargs": {
37
+ "padding_side": "left",
38
+ "return_mm_token_type_ids": False,
39
+ },
40
+ "images_kwargs": {
41
+ "crop_to_patches": True,
42
+ },
43
+ "videos_kwargs": {},
44
+ }
45
+
46
+
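A hedged illustration of the defaults above: unless the caller overrides them, text is left-padded and images are cropped into patches:

```python
defaults = InternS1ProcessorKwargs._defaults
print(defaults["text_kwargs"]["padding_side"])       # 'left'
print(defaults["images_kwargs"]["crop_to_patches"])  # True
```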
47
+ class InternS1Processor(ProcessorMixin):
48
+ r"""
49
+ Constructs an InternS1 processor which wraps an [`AutoImageProcessor`] and a
50
+ [`PreTrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and
51
+ tokenizer functionalities. See the [`~InternS1Processor.__call__`] and [`~InternS1Processor.decode`] for more information.
52
+ Args:
53
+ image_processor ([`AutoImageProcessor`], *optional*):
54
+ The image processor is a required input.
55
+ tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
56
+ The tokenizer is a required input.
57
+ video_processor ([`AutoVideoProcessor`], *optional*):
58
+ The video processor is a required input.
59
+ image_seq_length (`int`, *optional*, defaults to 256):
60
+ The number of image tokens to use per image patch. It should be set so that:
61
+ image_seq_length = (config.image_size // config.patch_size) ** 2 * (config.scale_factor**2)
62
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
63
+ in a chat into a tokenizable string.
64
+ """
65
+
66
+ attributes = ["image_processor", "tokenizer", "video_processor"]
67
+ image_processor_class = "AutoImageProcessor"
68
+ video_processor_class = "AutoVideoProcessor"
69
+ tokenizer_class = "AutoTokenizer"
70
+
71
+ def __init__(
72
+ self,
73
+ image_processor=None,
74
+ tokenizer=None,
75
+ video_processor=None,
76
+ image_seq_length: int = 256,
77
+ chat_template=None,
78
+ **kwargs,
79
+ ):
80
+ self.image_seq_length = image_seq_length
81
+ self.start_image_token = tokenizer.start_image_token
82
+ self.end_image_token = tokenizer.end_image_token
83
+ self.start_image_token_id = tokenizer.start_image_token_id
84
+ self.end_image_token_id = tokenizer.end_image_token_id
85
+ self.image_token = tokenizer.context_image_token
86
+ self.video_token = tokenizer.video_token
87
+ self.image_token_id = tokenizer.context_image_token_id
88
+ self.image_ids = [self.image_token_id, self.start_image_token_id, self.end_image_token_id]
89
+
90
+ super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template, **kwargs)
91
+
92
+ def _insert_media_placeholders(
93
+ self,
94
+ text: list[str],
95
+ image_pixel_values,
96
+ video_pixel_values,
97
+ image_num_patches: list[int],
98
+ video_num_patches: list[int],
99
+ image_num_patches_indices: np.ndarray,
100
+ video_num_patches_indices: np.ndarray,
101
+ video_patch_indices: np.ndarray,
102
+ ):
103
+ """
104
+ Processes interleaved text with <image> and <video> placeholders, replacing them with appropriate
105
+ image and video tokens while keeping track of the patches used.
106
+ """
107
+ image_index = 0
108
+ video_index = 0
109
+ processed_text = []
110
+ image_video_patches = []
111
+ replace_strings = []
112
+ # Support interleaved image and video in prompts:
113
+ # Processed patches of images and videos are inserted in `image_video_patches` in the order they appear in the prompts
114
+ for prompt in text:
115
+ new_prompt = prompt
116
+ while self.image_token in new_prompt or self.video_token in new_prompt:
117
+ if self.image_token in new_prompt and (
118
+ self.video_token not in new_prompt
119
+ or new_prompt.index(self.image_token) < new_prompt.index(self.video_token)
120
+ ):
121
+ # Get the slice of patches corresponding to the current image
122
+ start_index = image_num_patches_indices[image_index - 1] if image_index > 0 else 0
123
+ end_index = image_num_patches_indices[image_index]
124
+ image_video_patches.append(image_pixel_values[start_index:end_index])
125
+ # Replace the corresponding image placeholder with the correct number of image tokens
126
+ new_prompt = new_prompt.replace(self.image_token, "<placeholder>", 1)
127
+ replace_strings.append(
128
+ f"{self.start_image_token}{self.image_token * self.image_seq_length * image_num_patches[image_index]}{self.end_image_token}"
129
+ )
130
+ image_index += 1
131
+ else:
132
+ # Get the slice of patches corresponding to the current video
133
+ # Here we need to account for both the multiple video frames and the potential multiple patches per frame
134
+ # As of now, InternS1 only supports one patch per frame, but we keep the code flexible for future updates
135
+ current_patch_index = video_patch_indices[video_index - 1] if video_index > 0 else 0
136
+ end_patch_index = video_patch_indices[video_index]
137
+ start_index = video_num_patches_indices[current_patch_index] if video_index > 0 else 0
138
+ end_index = video_num_patches_indices[end_patch_index - 1]
139
+ image_video_patches.append(video_pixel_values[start_index:end_index])
140
+ # Get the number of patches per frame and replace the video placeholder with the correct number of image tokens
141
+ num_patches = list(video_num_patches[current_patch_index:end_patch_index])
142
+ video_prompt = "\n".join(
143
+ f"Frame{i + 1}: {self.start_image_token}{self.image_token * self.image_seq_length * num_patches[i]}{self.end_image_token}"
144
+ for i in range(len(num_patches))
145
+ )
146
+ replace_strings.append(video_prompt)
147
+ new_prompt = new_prompt.replace(self.video_token, "<placeholder>", 1)
148
+ video_index += 1
149
+ while "<placeholder>" in new_prompt:
150
+ replace_str = replace_strings.pop(0)
151
+ new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)
152
+ processed_text.append(new_prompt)
153
+
154
+ return processed_text, image_video_patches, image_index, video_index
155
+
156
+ def __call__(
157
+ self,
158
+ images: Optional[ImageInput] = None,
159
+ text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
160
+ audio=None,
161
+ videos: Optional[VideoInput] = None,
162
+ **kwargs: Unpack[InternS1ProcessorKwargs],
163
+ ) -> BatchFeature:
164
+ """
165
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
166
+ and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
167
+ is not `None`, otherwise encodes default OCR queries which depend on the `format`, `box`, `color`, `multi_page` and
168
+ `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
169
+ GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.
170
+
171
+ Args:
172
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
173
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
174
+ tensor. Both channels-first and channels-last formats are supported.
175
+ text (`str`, `list[str]`, `list[list[str]]`):
176
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
177
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
178
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
179
+ videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
180
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor.
181
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
182
+ If set, will return tensors of a particular framework. Acceptable values are:
183
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
184
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
185
+ - `'np'`: Return NumPy `np.ndarray` objects.
186
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
187
+
188
+ Returns:
189
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
190
+
191
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
192
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
193
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
194
+ `None`).
195
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
196
+ """
197
+ if text is None:
198
+ raise ValueError("You have to specify text.")
199
+
200
+ output_kwargs = self._merge_kwargs(
201
+ InternS1ProcessorKwargs,
202
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
203
+ **kwargs,
204
+ )
205
+
206
+ if not isinstance(text, (list, tuple)):
207
+ text = [text]
208
+
209
+ # Process images and videos separately, as videos don't support crop_to_patches
210
+ image_num_patches = []
211
+ video_num_patches = []
212
+ image_videos_inputs = {}
213
+ image_pixel_values = None
214
+ video_pixel_values = None
215
+ image_num_patches_indices = np.array([0])
216
+ video_patch_indices = np.array([0])
217
+ video_num_patches_indices = np.array([0])
218
+ if images is not None:
219
+ images = make_flat_list_of_images(images)
220
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
221
+ image_num_patches = image_inputs.pop("num_patches")
222
+ image_pixel_values = image_inputs.pop("pixel_values")
223
+ image_num_patches_indices = np.cumsum(image_num_patches)
224
+ if videos is not None:
225
+ videos = make_batched_videos(videos)
226
+ video_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
227
+ video_pixel_values = video_inputs.pop("pixel_values_videos")
228
+
229
+ # Obtain per frame information first and then flatten to (BS * T, ...)
230
+ num_frames_per_video = [len(video) for video in video_pixel_values]
231
+ video_num_patches = [1 for frames in num_frames_per_video for _ in range(frames)]
232
+ video_patch_indices = np.cumsum(num_frames_per_video)
233
+ video_num_patches_indices = np.cumsum(video_num_patches)
234
+ video_pixel_values = video_pixel_values.flatten(0, 1)
235
+
236
+ if images is not None or videos is not None:
237
+ text, image_video_patches, image_index, video_index = self._insert_media_placeholders(
238
+ text,
239
+ image_pixel_values,
240
+ video_pixel_values,
241
+ image_num_patches,
242
+ video_num_patches,
243
+ image_num_patches_indices,
244
+ video_num_patches_indices,
245
+ video_patch_indices,
246
+ )
247
+ if images is not None and image_index != len(images):
248
+ raise ValueError("Number of image placeholders in the prompt does not match the number of images.")
249
+ if videos is not None and video_index != len(videos):
250
+ raise ValueError("Number of video placeholders in the prompt does not match the number of videos.")
251
+
252
+ # Concatenate the interleaved image and video patches (function agnostic to the patches type (list, numpy array, torch tensor))
253
+ image_videos_inputs = {"pixel_values": concatenate_list(image_video_patches)}
254
+
255
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
256
+ return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
257
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
258
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
259
+
260
+ if return_mm_token_type_ids:
261
+ array_ids = np.array(text_inputs["input_ids"])
262
+ mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
263
+ mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1
264
+ text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
265
+
266
+ return BatchFeature(data={**text_inputs, **image_videos_inputs}, tensor_type=return_tensors)
267
+
268
+ def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
269
+ """
270
+ Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
271
+
272
+ Args:
273
+ image_sizes (`list[list[int]]`, *optional*):
274
+ The input sizes formatted as (height, width) for each image.
275
+
276
+ Returns:
277
+ `MultiModalData`: A `MultiModalData` object holding the number of tokens for each of the provided
278
+ input modalities, along with other useful data.
279
+ """
280
+
281
+ vision_data = {}
282
+ if image_sizes is not None:
283
+ images_kwargs = InternS1ProcessorKwargs._defaults.get("images_kwargs", {})
284
+ images_kwargs.update(kwargs)
285
+
286
+ num_image_patches = [
287
+ self.image_processor.get_number_of_image_tokens(*image_size, images_kwargs)
288
+ for image_size in image_sizes
289
+ ]
290
+ # Add 2 for BOI and EOI tokens
291
+ num_image_tokens = [2 + (self.image_seq_length * num_patches) for num_patches in num_image_patches]
292
+ vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
293
+
294
+ return MultiModalData(**vision_data)
295
+
296
+ def batch_decode(self, *args, **kwargs):
297
+ """
298
+ This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
299
+ refer to the docstring of this method for more information.
300
+ """
301
+ return self.tokenizer.batch_decode(*args, **kwargs)
302
+
303
+ def decode(self, *args, **kwargs):
304
+ """
305
+ This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
306
+ the docstring of this method for more information.
307
+ """
308
+ return self.tokenizer.decode(*args, **kwargs)
309
+
310
+ @property
311
+ def model_input_names(self):
312
+ tokenizer_input_names = self.tokenizer.model_input_names
313
+ image_processor_input_names = self.image_processor.model_input_names
314
+ return list(tokenizer_input_names) + list(image_processor_input_names)
315
+
316
+
317
+ __all__ = ["InternS1Processor"]
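+ 
+ # --- Usage sketch (illustrative only, not part of the uploaded module) ---
+ # A minimal example of driving this processor through `AutoProcessor`; the repository id is a
+ # placeholder, and the prompt relies on `processor.image_token`, which is assumed to be defined
+ # in the processor's `__init__` (mirroring `self.video_token` used above).
+ #
+ # from PIL import Image
+ # from transformers import AutoProcessor
+ #
+ # processor = AutoProcessor.from_pretrained("<path-or-repo-id>", trust_remote_code=True)
+ # prompt = f"Describe this image: {processor.image_token}"
+ # inputs = processor(text=[prompt], images=[Image.open("example.jpg")], return_tensors="pt")
+ # # `inputs` holds `input_ids`, `attention_mask` and `pixel_values`, as documented in `__call__`.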
processor_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "InternS1Processor",
4
+ "auto_map": {
5
+ "AutoProcessor": "processing_interns1.InternS1Processor"
6
+ }
7
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "context_image_token": "<IMG_CONTEXT>",
18
+ "end_image_token": "</img>",
19
+ "eos_token": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "pad_token": {
27
+ "content": "<|endoftext|>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ },
33
+ "start_image_token": "<img>",
34
+ "video_token": "<video>"
35
+ }
tokenization_interns1.py ADDED
@@ -0,0 +1,974 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Intern team and Shanghai AI Lab team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for InternS1."""
16
+
17
+ from typing import Union, Dict, List, Optional, Tuple
18
+ import json
19
+ import os
20
+ from functools import lru_cache
21
+ from abc import ABC, abstractmethod
22
+ import regex as re
23
+
24
+ import sentencepiece as spm
25
+ from collections import OrderedDict
26
+
27
+ from transformers.tokenization_utils_base import AddedToken, TextInput
28
+ from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
29
+ from transformers.utils import logging
30
+
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ try:
35
+ from rdkit import Chem
36
+ from rdkit import RDLogger
37
+
38
+ RDLogger.DisableLog("rdApp.error")
39
+ RDLogger.DisableLog("rdApp.*")
40
+ RDKIT_AVAILABLE = True
41
+ except ImportError:
42
+ logger.warning_once(
43
+ "If tokenization of SMILES formulas is required, please 'pip install rdkit' for better tokenization quality."
44
+ )
45
+ RDKIT_AVAILABLE = False
46
+
47
+ VOCAB_FILES_NAMES = {
48
+ "vocab_file": "vocab.json",
49
+ "merges_file": "merges.txt",
50
+ "sp_model_SMILES": "tokenizer_SMILES.model",
51
+ "sp_model_IUPAC": "tokenizer_IUPAC.model",
52
+ "sp_model_FASTA": "tokenizer_FASTA.model",
53
+ }
54
+
55
+ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
56
+
57
+
58
+ class InternS1CheckModuleMixin(ABC):
59
+ """
60
+ Basic auto-detection module.
61
+
62
+ Note that short strings are ignored by this module.
63
+ """
64
+ def __init__(self, *, min_length: int):
65
+ self.min_length = min_length
66
+ self.REGEX = self._build_regex()
67
+ self.auto_detect_token = []
68
+ self.truncation = False
69
+
70
+ @abstractmethod
71
+ def _build_regex(self):
72
+ pass
73
+
74
+ @abstractmethod
75
+ def check_legitimacy(self, candidate: str) -> bool:
76
+ pass
77
+
78
+ def re_split(self, texts: Union[str, List[str]]) -> List[str]:
79
+ if isinstance(texts, str):
80
+ texts = [texts]
81
+
82
+ total_results = []
83
+
84
+ for text in texts:
85
+ results = []
86
+ current_pos = 0
87
+ for match in self.REGEX.finditer(text):
88
+ candidate = match.group(1)
89
+
90
+ if len(candidate) >= self.min_length:
91
+ match_start, match_end = match.span(1)
92
+
93
+ if not self.check_legitimacy(candidate):
94
+ continue
95
+
96
+ if not self.truncation:
97
+ if match_start > 0 and text[match_start - 1].encode("UTF-8").isalpha():
98
+ continue
99
+ if match_end < len(text) and text[match_end].encode("UTF-8").isalpha():
100
+ continue
101
+
102
+ if match_start > current_pos:
103
+ non_candidate_part = text[current_pos:match_start]
104
+ results.append(non_candidate_part)
105
+ else:
106
+ continue
107
+
108
+ results.extend([self.auto_detect_token[0], candidate, self.auto_detect_token[1]])
109
+ current_pos = match_end
110
+
111
+ if current_pos < len(text):
112
+ remaining_part = text[current_pos:]
113
+ results.append(remaining_part)
114
+
115
+ total_results.extend(results)
116
+
117
+ return total_results
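+ 
+ # Illustrative example (assumption, using the FastaCheckModule subclass defined below):
+ # FastaCheckModule().re_split("seq: MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ end")
+ # -> ["seq: ", "<FASTA_AUTO_DETECT>", "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", "</FASTA_AUTO_DETECT>", " end"]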
118
+
119
+
120
+ class FastaCheckModule(InternS1CheckModuleMixin):
121
+ """
122
+ Protein sequence auto-detection module.
123
+
124
+ Automatically detects protein sequences using regex patterns.
125
+ """
126
+ def __init__(self, *, min_length: int = 27):
127
+ super().__init__(min_length=min_length)
128
+ self.auto_detect_token = ["<FASTA_AUTO_DETECT>", "</FASTA_AUTO_DETECT>"]
129
+ self.truncation = True
130
+
131
+ def _build_regex(self):
132
+ return re.compile(r"([A-Z]{" + str(self.min_length) + r",})")
133
+
134
+ def check_legitimacy(self, candidate: str):
135
+ return True
136
+
137
+
138
+ bonds = ["-", "=", "#", ":", "/", "\\", ".", "$"]
139
+ organic_symbols = ["B", "C", "N", "O", "P", "S", "F", "Cl", "Br", "I"]
140
+ other_allows = bonds + ["[", "]", "(", ")", ";"]
141
+ aromatic_symbols = ["b", "c", "n", "o", "s", "p"]
142
+ elements = [
143
+ "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
144
+ "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
145
+ "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
146
+ "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
147
+ "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
148
+ "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
149
+ "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
150
+ "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
151
+ "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
152
+ "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
153
+ "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
154
+ "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
155
+ ]
156
+
157
+
158
+ class SmilesCheckModule(InternS1CheckModuleMixin):
159
+ """
160
+ SMILES molecular sequence auto-detection module.
161
+
162
+ Automatically detects and validates SMILES strings in text using regex patterns
163
+ or chemical syntax rules. Uses RDKit for precise validation when available,
164
+ otherwise falls back to rule-based validation.
165
+ """
166
+ def __init__(self, *, min_length: int = 10):
167
+ super().__init__(min_length=min_length)
168
+ self.auto_detect_token = ["<SMILES_AUTO_DETECT>", "</SMILES_AUTO_DETECT>"]
169
+ self._SQ_BRACKET_BAN_1 = re.compile(r'(?:[A-GI-Z]|[a-z]){3,}')
170
+ self._SQ_BRACKET_BAN_2 = re.compile(r'\d{4,}')
171
+
172
+ def _build_regex(self):
173
+ _two_letter_elements = [
174
+ 'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'Ba', 'Be', 'Bh', 'Bi', 'Bk', 'Br', 'Ca', 'Cd',
175
+ 'Ce', 'Cf', 'Cl', 'Cm', 'Cn', 'Co', 'Cr', 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'Fe',
176
+ 'Fl', 'Fm', 'Fr', 'Ga', 'Gd', 'Ge', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'In', 'Ir', 'Kr', 'La', 'Li',
177
+ 'Lr', 'Lu', 'Lv', 'Mc', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'Na', 'Nb', 'Nd', 'Ne', 'Nh', 'Ni', 'No',
178
+ 'Np', 'Og', 'Os', 'Pa', 'Pb', 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg',
179
+ 'Rh', 'Rn', 'Ru', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te', 'Th',
180
+ 'Ti', 'Tl', 'Tm', 'Ts', 'Xe', 'Yb', 'Zn', 'Zr'
181
+ ]
182
+ _single_letter_elements = [
183
+ "B", "C", "F", "H", "I", "K", "N", "O", "P", "S", "U", "V", "W", "Y", 'b', 'c', 'n', 'o', 'p', 's'
184
+ ]
185
+ all_elements_sorted = sorted(_two_letter_elements + _single_letter_elements, key=lambda x: (-len(x), x))
186
+ elements_pattern_str = "|".join(all_elements_sorted)
187
+
188
+ bracket_atom_pattern_str = r"\[[^\]]+\]"
189
+ other_single_chars_pattern_str = r"[\(\)\.=\-#@\d\$\%\*:\+\-\/\\]"
190
+ smiles_unit_pattern = (
191
+ r"(?:"
192
+ + bracket_atom_pattern_str
193
+ + r"|"
194
+ + elements_pattern_str
195
+ + r"|"
196
+ + other_single_chars_pattern_str
197
+ + r")"
198
+ )
199
+ core_sequence_pattern = rf"(?>{smiles_unit_pattern}){{10,}}"
200
+ constrained_core_sequence_pattern = rf"(?![:.=]){core_sequence_pattern}(?<![:.=])"
201
+
202
+ final_regex_str = rf"({constrained_core_sequence_pattern})"
203
+
204
+ COMPILED_REGEX = re.compile(final_regex_str)
205
+ return COMPILED_REGEX
206
+
207
+ def check_legitimacy_slow(self, candidate: str) -> bool:
208
+ """Check legitimacy with RDKit"""
209
+ if sum(1 for char in candidate if char.encode("UTF-8").isalpha()) < 5:
210
+ return False
211
+
212
+ mol = Chem.MolFromSmiles(candidate)
213
+ if mol is None:
214
+ return False
215
+ else:
216
+ return True
217
+
218
+ def check_legitimacy_fast(self, candidate: str) -> bool:
219
+ """Check legitimacy with hard rules"""
220
+ if sum(1 for char in candidate if char.encode("UTF-8").isalpha()) < 5:
221
+ return False
222
+
223
+ if not self.check_rings_and_brackets(candidate):
224
+ return False
225
+ else:
226
+ return True
227
+
228
+ def check_legitimacy(self, candidate: str) -> bool:
229
+ if RDKIT_AVAILABLE:
230
+ return self.check_legitimacy_slow(candidate)
231
+ else:
232
+ return self.check_legitimacy_fast(candidate)
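+ 
+ # Illustrative example (assumption): with RDKit installed,
+ # SmilesCheckModule().check_legitimacy("CC(=O)Oc1ccccc1C(=O)O") -> True   (aspirin, a valid SMILES)
+ # SmilesCheckModule().check_legitimacy("Thisisnotasmiles") -> False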
233
+
234
+ def check_brackets(self, text):
235
+ matches = re.findall(r"\[([^\[\]]*)\]", text)
236
+ for part in matches:
237
+ if "(" in part or ")" in part:
238
+ return False
239
+ if len(part) == 0:
240
+ return False
241
+ if part[0] in elements or part[0] in aromatic_symbols or part[:2] in elements:
242
+ return True
243
+ return True
244
+
245
+ def check_rings_and_brackets(self, text):
246
+ rings = {}
247
+ left_sq_bracket, right_sq_bracket = 0, 0
248
+ left_pt_bracket, right_pt_bracket = 0, 0
249
+ all_lower = True
250
+ digits_cnt = 0
251
+ pos = 0
252
+ while pos < len(text):
253
+ step = 0
254
+ c = text[pos]
255
+ if ord(c) >= 65 and ord(c) <= 90:
256
+ all_lower = False
257
+ if (pos == len(text) - 1 or pos == 0) and c in bonds:
258
+ return False
259
+ if pos > 0 and text[pos - 1] in bonds and text[pos] in bonds:
260
+ return False
261
+ if c == "[":
262
+ step = 1
263
+ left_sq_bracket += 1
264
+ if left_sq_bracket > right_sq_bracket + 1:
265
+ return False
266
+ if pos == len(text)-1:
267
+ return False
268
+ if ']' not in text[pos+1:]:
269
+ return False
270
+ bracket_span = text[pos+1:text.find(']', pos+1)]
271
+
272
+ if self._SQ_BRACKET_BAN_1.search(bracket_span) or self._SQ_BRACKET_BAN_2.search(bracket_span):
273
+ return False
274
+
275
+ matches = re.findall(r'\d+', bracket_span)
276
+ if len(matches)>2:
277
+ return False
278
+ if c == "]":
279
+ step = 1
280
+ right_sq_bracket += 1
281
+ if right_sq_bracket > left_sq_bracket:
282
+ return False
283
+
284
+ if c == "(":
285
+ step = 1
286
+ left_pt_bracket += 1
287
+ if c == ")":
288
+ step = 1
289
+ right_pt_bracket += 1
290
+ if right_pt_bracket > left_pt_bracket:
291
+ return False
292
+
293
+ if left_sq_bracket == right_sq_bracket:
294
+ if c.isdigit():
295
+ digits_cnt += 1
296
+ step = 1
297
+ if (
298
+ pos == 0
299
+ or (pos == 1 and text[pos - 1] != "%")
300
+ or (pos > 1 and text[pos - 1] != "%" and text[pos - 2] != "%")
301
+ ):
302
+ if c in rings:
303
+ if rings[c] == "unclosed":
304
+ rings[c] = "closed"
305
+ else:
306
+ rings[c] = "unclosed"
307
+ else:
308
+ rings[c] = "unclosed"
309
+ if c == "%":
310
+ if pos >= len(text) - 2 or not text[pos + 1].isdigit() or not text[pos + 2].isdigit():
311
+ return False
312
+ step = 3
313
+ digits_cnt += 1
314
+ num = text[pos + 1 : pos + 3]
315
+ if num in rings:
316
+ if rings[num] == "unclosed":
317
+ rings[num] = "closed"
318
+ else:
319
+ rings[num] = "unclosed"
320
+ else:
321
+ rings[num] = "unclosed"
322
+ if step == 0:
323
+ if (
324
+ pos < len(text) - 1
325
+ and text[pos : pos + 2] in organic_symbols + aromatic_symbols + other_allows
326
+ ):
327
+ step = 2
328
+ elif c in organic_symbols + aromatic_symbols + other_allows:
329
+ step = 1
330
+ else:
331
+ return False
332
+
333
+ if step == 0:
334
+ step = 1
335
+ pos += step
336
+
337
+ if left_sq_bracket != right_sq_bracket or any(v == "unclosed" for v in rings.values()):
338
+ return False
339
+ if all_lower and digits_cnt < 2:
340
+ return False
341
+ return self.check_brackets(text)
342
+
343
+
344
+ class InternS1Tokenizer(Qwen2Tokenizer):
345
+ """
346
+ Construct an InternS1 tokenizer. Based on byte-level Byte-Pair-Encoding.
347
+
348
+ Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
349
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
350
+
351
+ ```python
352
+ >>> from transformers import AutoTokenizer
353
+
354
+ >>> tokenizer = AutoTokenizer.from_pretrained("InternS1Tokenizer", trust_remote_code=True)
355
+ >>> tokenizer("Hello world")["input_ids"]
356
+ [9707, 1879]
357
+
358
+ >>> tokenizer(" Hello world")["input_ids"]
359
+ [21927, 1879]
360
+ ```
361
+ This is expected.
362
+
363
+ It includes custom extensions to support better domain-specific text tokenization, leveraging separately trained tokenizer models.
364
+ Users should refer to the superclass [`PreTrainedTokenizer`] for more information regarding these overridden methods.
365
+
366
+ Args:
367
+ vocab_file (`str`):
368
+ Path to the vocabulary file.
369
+ merges_file (`str`):
370
+ Path to the merges file.
371
+ errors (`str`, *optional*, defaults to `"replace"`):
372
+ Paradigm to follow when decoding bytes to UTF-8. See
373
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
374
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
375
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
376
+ token instead.
377
+ bos_token (`str`, *optional*):
378
+ The beginning of sequence token. Not applicable for this tokenizer.
379
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
380
+ The end of sequence token.
381
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
382
+ The token used for padding, for example when batching sequences of different lengths.
383
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
384
+ Whether or not the model should cleanup the spaces that were added when splitting the input text during the
385
+ tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
386
+ split_special_tokens (`bool`, *optional*, defaults to `False`):
387
+ Whether or not the special tokens should be split during the tokenization process. The default behavior is
388
+ to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
389
+ ['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give `['<',
390
+ '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
391
+ """
392
+
393
+ vocab_files_names = VOCAB_FILES_NAMES
394
+ model_input_names = ["input_ids", "attention_mask"]
395
+
396
+ def __init__(
397
+ self,
398
+ vocab_file,
399
+ merges_file,
400
+ errors="replace",
401
+ unk_token="<|endoftext|>",
402
+ bos_token=None,
403
+ eos_token="<|endoftext|>",
404
+ pad_token="<|endoftext|>",
405
+ clean_up_tokenization_spaces=False,
406
+ split_special_tokens=False,
407
+ **kwargs,
408
+ ):
409
+ self.extra_tokenizer_start_mapping = {}
410
+ self.extra_tokenizer_end_mapping = {}
411
+ self._extra_special_tokens = []
412
+
413
+ self._extra_tokenizer_list = [
414
+ dict(
415
+ tokenizer_name="tokenizer_SMILES",
416
+ tokenizer_path=os.path.join(os.path.dirname(vocab_file), "tokenizer_SMILES.model"),
417
+ begin_sp_tokens=["<SMILES>", "<SELFIES>"],
418
+ end_sp_tokens=["</SMILES>", "</SELFIES>"],
419
+ auto_begin_sp_tokens=["<SMILES_AUTO_DETECT>"],
420
+ auto_end_sp_tokens=["</SMILES_AUTO_DETECT>"],
421
+ ),
422
+ dict(
423
+ tokenizer_name="tokenizer_IUPAC",
424
+ tokenizer_path=os.path.join(os.path.dirname(vocab_file), "tokenizer_IUPAC.model"),
425
+ begin_sp_tokens=["<IUPAC>"],
426
+ end_sp_tokens=["</IUPAC>"],
427
+ auto_begin_sp_tokens=[],
428
+ auto_end_sp_tokens=[],
429
+ ),
430
+ dict(
431
+ tokenizer_name="tokenizer_FASTA",
432
+ tokenizer_path=os.path.join(os.path.dirname(vocab_file), "tokenizer_FASTA.model"),
433
+ begin_sp_tokens=[],
434
+ end_sp_tokens=[],
435
+ auto_begin_sp_tokens=["<FASTA_AUTO_DETECT>"],
436
+ auto_end_sp_tokens=["</FASTA_AUTO_DETECT>"],
437
+ ),
438
+ ]
439
+ # Content wrapped in these sp tokens won't be passed through the extra (domain-specific) tokenizers
440
+ self.protect_begin_sp_tokens = ["<MOLFORMULA>"]
441
+ self.protect_end_sp_tokens = ["</MOLFORMULA>"]
442
+
443
+ self.auto_begin_sp_tokens = []
444
+ self.auto_end_sp_tokens = []
445
+
446
+ self._unk_token = "<unk>" # Fall-back
447
+
448
+ self.new_sp_token_offset = [26]  # The number of sp tokens before the start of the extra vocab
449
+ self.tokenizer_mapping = OrderedDict()
450
+
451
+ super().__init__(
452
+ vocab_file=vocab_file,
453
+ merges_file=merges_file,
454
+ errors=errors,
455
+ unk_token=unk_token,
456
+ bos_token=bos_token,
457
+ eos_token=eos_token,
458
+ pad_token=pad_token,
459
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
460
+ split_special_tokens=split_special_tokens,
461
+ **kwargs,
462
+ )
463
+
464
+ # keep order for python < 3.7
465
+ self.tokenizer_mapping = OrderedDict([("tokenizer_original", self.encoder)])
466
+
467
+ if self._extra_tokenizer_list is not None:
468
+ for tokenizer_config in self._extra_tokenizer_list:
469
+ self._build_extra_tokenizer(tokenizer_config)
470
+ self._update_special_tokens(tokenizer_config)
471
+ self._update_logical_special_tokens(tokenizer_config)
472
+ self.decoder.update(self._build_extra_decoder(tokenizer_config))
473
+
474
+ for token in self.protect_begin_sp_tokens:
475
+ self.tokens_trie.add(token)
476
+
477
+ for token in self.protect_end_sp_tokens:
478
+ self.tokens_trie.add(token)
479
+
480
+ self.new_sp_token_offset.append(len(self._added_tokens_decoder) - sum(self.new_sp_token_offset) + len(self._extra_special_tokens))
481
+ self.check_module_list = [SmilesCheckModule(), FastaCheckModule()]
482
+
483
+ @property
484
+ def vocab_size(self) -> int:
485
+ """Returns vocab size including extra tokenizer"""
486
+ total_vocab_size = len(self.encoder)
487
+ for tokenizer in self.tokenizer_mapping.values():
488
+ if isinstance(tokenizer, dict):
489
+ continue
490
+ else:
491
+ total_vocab_size += tokenizer.get_piece_size()
492
+ return total_vocab_size + sum(self.new_sp_token_offset)
493
+
494
+ def __len__(self) -> int:
495
+ """Overload method"""
496
+ return self.vocab_size
497
+
498
+ @property
499
+ def logical_auto_tokens(self):
500
+ """Tokens that won't be decoded and are only used for switching tokenizers"""
501
+ return self.auto_begin_sp_tokens + self.auto_end_sp_tokens
502
+
503
+ @property
504
+ def extra_tokenizer_bos_keys(self):
505
+ return self.extra_tokenizer_start_mapping.keys()
506
+
507
+ @property
508
+ def extra_tokenizer_eos_keys(self):
509
+ return self.extra_tokenizer_end_mapping.keys()
510
+
511
+ @property
512
+ def protect_sp_tokens(self):
513
+ """Content wrapped by these sp tokens won't be processed by the extra tokenizers"""
514
+ return self.protect_begin_sp_tokens + self.protect_end_sp_tokens
515
+
516
+ def _build_extra_tokenizer(self, tokenizer_config: dict) -> None:
517
+ """
518
+ Build domain-specific tokenizers
519
+ and register them in tokenizer_mapping
520
+ """
521
+ _sp_model = spm.SentencePieceProcessor()
522
+ _sp_model.Load(tokenizer_config["tokenizer_path"])
523
+ self.tokenizer_mapping.update({tokenizer_config["tokenizer_name"]: _sp_model})
524
+
525
+ for begin_sp_token, end_sp_token in zip(
526
+ tokenizer_config["begin_sp_tokens"], tokenizer_config["end_sp_tokens"]
527
+ ):
528
+ self.extra_tokenizer_start_mapping.update({begin_sp_token: tokenizer_config["tokenizer_name"]})
529
+ self.extra_tokenizer_end_mapping.update({end_sp_token: tokenizer_config["tokenizer_name"]})
530
+
531
+ for begin_sp_token, end_sp_token in zip(
532
+ tokenizer_config["auto_begin_sp_tokens"], tokenizer_config["auto_end_sp_tokens"]
533
+ ):
534
+ self.extra_tokenizer_start_mapping.update({begin_sp_token: tokenizer_config["tokenizer_name"]})
535
+ self.extra_tokenizer_end_mapping.update({end_sp_token: tokenizer_config["tokenizer_name"]})
536
+
537
+ def _build_extra_decoder(self, tokenizer_config: dict) -> Dict[int, str]:
538
+ """Build domain-specific tokenizers' decoder"""
539
+ extra_decoder = {}
540
+ sp_model = self.tokenizer_mapping[tokenizer_config["tokenizer_name"]]
541
+ start_pos = self.vocab_size - sp_model.get_piece_size() - self.new_sp_token_offset[-1]
542
+ extra_decoder.update(
543
+ {i: sp_model.id_to_piece(i - start_pos) for i in range(start_pos, start_pos + sp_model.get_piece_size())}
544
+ )
545
+ return extra_decoder
546
+
547
+ def _update_logical_special_tokens(self, tokenizer_config: dict) -> None:
548
+ """Update logical special tokens, which act as special tokens but are not mapped to specific token ids"""
549
+ for begin_sp_token, end_sp_token in zip(
550
+ tokenizer_config["auto_begin_sp_tokens"], tokenizer_config["auto_end_sp_tokens"]
551
+ ):
552
+ self.auto_begin_sp_tokens.append(begin_sp_token)
553
+ self.auto_end_sp_tokens.append(end_sp_token)
554
+
555
+ self.tokens_trie.add(begin_sp_token)
556
+ self.tokens_trie.add(end_sp_token)
557
+
558
+ def _update_special_tokens(self, tokenizer_config: dict):
559
+ """Update special tokens for each modality"""
560
+ offset = sum(self.new_sp_token_offset[1:]) + len(self.logical_auto_tokens)
561
+ new_offset = 0
562
+ for start_key, end_key in zip(
563
+ list(self.extra_tokenizer_bos_keys)[offset // 2 :], list(self.extra_tokenizer_eos_keys)[offset // 2 :]
564
+ ):
565
+ self.tokens_trie.add(start_key)
566
+
567
+ if start_key not in tokenizer_config["auto_begin_sp_tokens"]:
568
+ self._added_tokens_encoder.update({start_key: self.vocab_size + new_offset})
569
+ self._added_tokens_decoder.update(
570
+ {
571
+ self.vocab_size + new_offset: AddedToken(
572
+ content=start_key,
573
+ lstrip=False,
574
+ normalized=False,
575
+ rstrip=False,
576
+ single_word=False,
577
+ special=True,
578
+ )
579
+ }
580
+ )
581
+ self.tokens_trie.add(start_key)
582
+ new_offset += 1
583
+
584
+ if end_key not in tokenizer_config["auto_end_sp_tokens"]:
585
+ self._added_tokens_encoder.update({end_key: self.vocab_size + new_offset})
586
+ self._added_tokens_decoder.update(
587
+ {
588
+ self.vocab_size + new_offset: AddedToken(
589
+ content=end_key,
590
+ lstrip=False,
591
+ normalized=False,
592
+ rstrip=False,
593
+ single_word=False,
594
+ special=True,
595
+ )
596
+ }
597
+ )
598
+ self.tokens_trie.add(end_key)
599
+ new_offset += 1
600
+ self.new_sp_token_offset.append(new_offset)
601
+
602
+ @lru_cache(maxsize=None) # May cause memory leak
603
+ def _extra_tokenizer_offset(self, tokenizer_key) -> int:
604
+ offset = 0
605
+ for index, (tokenizer_name, tokenizer) in enumerate(self.tokenizer_mapping.items()):
606
+ if tokenizer_name == tokenizer_key:
607
+ break
608
+ else:
609
+ offset += len(tokenizer) + self.new_sp_token_offset[index]
610
+ return offset
611
+
612
+ def _pop_logical_sp_token(self, extra_tokenizer_stack: list, mapping_name: str) -> None:
613
+ """Switch back the active tokenizer when an end sp token is encountered"""
614
+ extra_tokenizer_end_mapping = extra_tokenizer_stack.pop()
615
+ if extra_tokenizer_end_mapping != self.extra_tokenizer_end_mapping[mapping_name]:
616
+ logger.warning_once(
617
+ f"Encountered incorrect nesting of extra tokenizers: {self.extra_tokenizer_end_mapping[mapping_name]} and {extra_tokenizer_end_mapping}"
618
+ )
619
+ logger.warning_once("This may lead to unexpected behaviour of the tokenizer, please check your input.")
620
+
621
+ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
622
+ """
623
+ Converts a string into a sequence of tokens, using the tokenizer.
624
+
625
+ It will switch to domain-specific tokenizer once encountering extra/logical sp tokens.
626
+
627
+ Args:
628
+ text (`TextInput`): The text to tokenize.
629
+ """
630
+ split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
631
+
632
+ text, kwargs = self.prepare_for_tokenization(text, **kwargs)
633
+
634
+ if kwargs:
635
+ logger.warning(f"Keyword arguments {kwargs} not recognized.")
636
+
637
+ if hasattr(self, "do_lower_case") and self.do_lower_case:
638
+ # convert non-special tokens to lowercase. Might be super slow as well?
639
+ escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
640
+ escaped_special_toks += [
641
+ re.escape(s_tok.content)
642
+ for s_tok in (self._added_tokens_decoder.values())
643
+ if not s_tok.special and s_tok.normalized
644
+ ]
645
+ pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
646
+ text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
647
+
648
+ if split_special_tokens:
649
+ no_split_token = []
650
+ tokens = [text]
651
+ else:
652
+ no_split_token = self._added_tokens_encoder.keys() # don't split on any of the added tokens
653
+ # "This is something<special_token_1> else"
654
+ tokens = self.tokens_trie.split(text)
655
+
656
+ # ["This is something", "<special_token_1>", " else"]
657
+ for i, token in enumerate(tokens):
658
+ if token in no_split_token:
659
+ tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
660
+ left = tokens[i - 1] if i > 0 else None
661
+ right = tokens[i + 1] if i < len(tokens) - 1 else None
662
+ if isinstance(tok_extended, AddedToken):
663
+ if tok_extended.rstrip and right:
664
+ # A bit counter-intuitive but we strip the left of the string
665
+ # since tok_extended.rstrip means the special token is eating all white spaces on its right
666
+ tokens[i + 1] = right.lstrip()
667
+ # Strip white spaces on the left
668
+ if tok_extended.lstrip and left:
669
+ tokens[i - 1] = left.rstrip() # Opposite here
670
+ if tok_extended.single_word and left and left[-1] != " ":
671
+ tokens[i - 1] += token
672
+ tokens[i] = ""
673
+ elif tok_extended.single_word and right and right[0] != " ":
674
+ tokens[i + 1] = token + tokens[i + 1]
675
+ tokens[i] = ""
676
+ else:
677
+ raise ValueError(
678
+ f"{tok_extended} cannot be tokenized because it was not properly added"
679
+ f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
680
+ )
681
+
682
+ # ["This is something", "<special_token_1>", "else"]
683
+ tokenized_text = []
684
+
685
+ # Code for Auto Detect
686
+ if self._extra_tokenizer_list is not None:
687
+ new_tokens = []
688
+ not_split_flag = 0
689
+ for token in tokens:
690
+ if not token:
691
+ continue
692
+ if token in no_split_token or token in self.protect_sp_tokens:
693
+ new_tokens.append(token)
694
+ if token in self.extra_tokenizer_bos_keys or token in self.protect_begin_sp_tokens:
695
+ not_split_flag += 1  # In case of nested sp tokens
696
+ elif token in self.extra_tokenizer_eos_keys or token in self.protect_end_sp_tokens:
697
+ not_split_flag = max(0, not_split_flag - 1)
698
+ else:
699
+ if not_split_flag:
700
+ new_tokens.append(token)
701
+ else:
702
+ for check_module in self.check_module_list:
703
+ token = check_module.re_split(token)
704
+
705
+ new_tokens.extend(token)
706
+ tokens = new_tokens
707
+
708
+ extra_tokenizer_stack = []  # This must be a stack to handle nested extra tokenizers
709
+
710
+ for token in tokens:
711
+ # Need to skip eventual empty (fully stripped) tokens
712
+ if not token:
713
+ continue
714
+ if token in self.protect_sp_tokens:
715
+ tokenized_text.extend(self._tokenize(token))
716
+ elif token in no_split_token:
717
+ tokenized_text.append(token)
718
+ if token in self.extra_tokenizer_bos_keys:
719
+ extra_tokenizer_stack.append(self.extra_tokenizer_start_mapping[token])
720
+ elif token in self.extra_tokenizer_eos_keys:
721
+ if extra_tokenizer_stack:
722
+ self._pop_logical_sp_token(extra_tokenizer_stack, token)
723
+ elif token in self.auto_begin_sp_tokens:
724
+ tokenized_text.append(token)
725
+ extra_tokenizer_stack.append(self.extra_tokenizer_start_mapping[token])
726
+ elif token in self.auto_end_sp_tokens:
727
+ tokenized_text.append(token)
728
+ if extra_tokenizer_stack:
729
+ self._pop_logical_sp_token(extra_tokenizer_stack, token)
730
+ else:
731
+ tokenized_text.extend(self._tokenize(token, extra_tokenizer_stack=extra_tokenizer_stack))
732
+
733
+ # ["This", " is", " something", "<special_token_1>", "else"]
734
+ return tokenized_text
735
+
736
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
737
+ """
738
+ Modified from `transformers.tokenization_utils._add_tokens`.
739
+
740
+ This adaptation supports dynamic tokenizer length due to supplementary tokenizers (e.g., domain-specific or scientific text tokenizers).
741
+ """
742
+ added_tokens = 0
743
+ if new_tokens is None:
744
+ return added_tokens
745
+ # TODO this is fairly slow to improve!
746
+ current_vocab = self.get_vocab().copy()
747
+ new_idx = max(current_vocab.values()) + 1
748
+
749
+ for token in new_tokens:
750
+ if not isinstance(token, (str, AddedToken)):
751
+ raise TypeError(f"Token {token} is not a string but a {type(token)}.")
752
+ if str(token) == "":
753
+ continue
754
+ if isinstance(token, str):
755
+ if token in self._added_tokens_encoder:
756
+ continue
757
+ else:
758
+ # very important for fast and slow equivalence!
759
+ is_special = token in self.all_special_tokens or special_tokens
760
+ token = AddedToken(
761
+ token, rstrip=False, lstrip=False, normalized=not is_special, special=is_special
762
+ )
763
+ elif special_tokens:
764
+ # doing token.special=True changes the normalization! will fix in rust
765
+ # this is important and the only reason why the AddedTokens in each class are normalized by default
766
+ token.__setstate__({"special": True, "normalized": token.normalized})
767
+ if token in self._added_tokens_decoder:
768
+ continue
769
+ if not token.special and token.normalized and getattr(self, "do_lower_case", False):
770
+ # Normalize if requested
771
+ token.content = token.content.lower()
772
+ if token.content not in current_vocab:
773
+ token_index = new_idx + added_tokens
774
+ current_vocab[token.content] = token_index
775
+ added_tokens += 1
776
+ self._extra_special_tokens.append(token)
777
+ else:
778
+ token_index = current_vocab[token.content]
779
+ if token.special and str(token) not in self.all_special_tokens:
780
+ self._special_tokens_map["additional_special_tokens"].append(token)
781
+ # the setter automatically updates the reverse map
782
+ self._added_tokens_decoder[token_index] = token
783
+ self._added_tokens_encoder[token.content] = token_index
784
+ if self.verbose:
785
+ logger.info(f"Adding {token} to the vocabulary")
786
+ self._update_trie()
787
+ self._update_total_vocab_size()
788
+
789
+ if added_tokens and self.tokenizer_mapping:
790
+ self.new_sp_token_offset.append(added_tokens)
791
+
792
+ return added_tokens
793
+
794
+ # Modified from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
795
+ def _tokenize(self, text, **kwargs):
796
+ """
797
+ Modified from `transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize`.
798
+
799
+ This adaptation supports domain-specific tokenizers.
800
+ """
801
+ extra_tokenizer_stack = kwargs.pop("extra_tokenizer_stack", False)
802
+ if extra_tokenizer_stack:
803
+ tokenized_text = self.tokenizer_mapping[extra_tokenizer_stack[-1]].encode(text, out_type=str)
804
+ tokenized_id = self.tokenizer_mapping[extra_tokenizer_stack[-1]].encode(text, out_type=int)
805
+ final_tokenized_text = []
806
+ for text_piece, id_piece in zip(tokenized_text, tokenized_id):
807
+ if id_piece == 0:
808
+ final_tokenized_text.extend(self._bpe_tokenize(text_piece))
809
+ else:
810
+ final_tokenized_text.append(text_piece)
811
+ return final_tokenized_text
812
+ else:
813
+ return self._bpe_tokenize(text)
814
+
815
+ def _bpe_tokenize(self, text, **kwargs):
816
+ text = text.replace(
817
+ "▁", " "
818
+ ) # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
819
+ bpe_tokens = []
820
+ for token in re.findall(self.pat, text):
821
+ token = "".join(
822
+ self.byte_encoder[b] for b in token.encode("utf-8")
823
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
824
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
825
+ return bpe_tokens
826
+
827
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
828
+ """
829
+ Modified from `transformers.tokenization_utils.PreTrainedTokenizer.convert_tokens_to_ids`.
830
+
831
+ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
832
+ vocabulary.
833
+
834
+ This adaptation supports domain-specific tokenizers.
835
+
836
+ Args:
837
+ tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
838
+
839
+ Returns:
840
+ `int` or `List[int]`: The token id or list of token ids.
841
+ """
842
+ if tokens is None:
843
+ return None
844
+
845
+ if isinstance(tokens, str):
846
+ return self._convert_token_to_id_with_added_voc(tokens)
847
+
848
+ ids = []
849
+ extra_tokenizer_stack = []
850
+
851
+ for token in tokens:
852
+ if token not in self.logical_auto_tokens:
853
+ ids.append(
854
+ self._convert_token_to_id_with_added_voc(token, extra_tokenizer_stack=extra_tokenizer_stack)
855
+ )
856
+ if token in self.extra_tokenizer_bos_keys:
857
+ extra_tokenizer_stack.append(self.extra_tokenizer_start_mapping[token])
858
+ elif token in self.extra_tokenizer_eos_keys:
859
+ if extra_tokenizer_stack:
860
+ self._pop_logical_sp_token(extra_tokenizer_stack, token)
861
+ return ids
862
+
863
+ def _convert_token_to_id_with_added_voc(self, token, **kwargs):
864
+ """
865
+ Modified from `transformers.tokenization_utils.PreTrainedTokenizer._convert_token_to_id_with_added_voc`.
866
+
867
+ This adaptation supports domain-specific tokenizers.
868
+ """
869
+ if token is None:
870
+ return None
871
+
872
+ if token in self._added_tokens_encoder:
873
+ return self._added_tokens_encoder[token]
874
+ return self._convert_token_to_id(token, **kwargs)
875
+
876
+ def _convert_token_to_id(self, token, **kwargs):
877
+ """
878
+ Modified from `transformers.tokenization_utils.PreTrainedTokenizer._convert_token_to_id`.
879
+
880
+ Converts a token (str) in an id using the vocab.
881
+
882
+ Falls back to the original tokenizer when a token is out of vocabulary (OOV).
883
+ """
884
+ extra_tokenizer_stack = kwargs.pop("extra_tokenizer_stack", False)
885
+ if extra_tokenizer_stack:
886
+ token_id = self.tokenizer_mapping[extra_tokenizer_stack[-1]].piece_to_id(token)
887
+ if token_id == self.tokenizer_mapping[extra_tokenizer_stack[-1]].unk_id():
888
+ return self.encoder.get(token, self.encoder.get(self._unk_token))
889
+ else:
890
+ return token_id + self._extra_tokenizer_offset(extra_tokenizer_stack[-1])
891
+ else:
892
+ return self.encoder.get(token, self.encoder.get(self._unk_token))
893
+
894
+ def convert_tokens_to_string(self, tokens):
895
+ """Converts a sequence of tokens (string) in a single string."""
896
+ text = "".join(tokens)
897
+ text = text.replace(
898
+ "▁", "Ġ"
899
+ ) # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
900
+ text = text.replace("\n", "Ċ")
901
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
902
+ return text
903
+
904
+ def decode(
905
+ self,
906
+ token_ids,
907
+ skip_special_tokens: bool = False,
908
+ clean_up_tokenization_spaces: Optional[bool] = False,
909
+ spaces_between_special_tokens: bool = False,
910
+ **kwargs,
911
+ ) -> str:
912
+ # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
913
+ # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
914
+ return super().decode(
915
+ token_ids,
916
+ skip_special_tokens=skip_special_tokens,
917
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
918
+ spaces_between_special_tokens=spaces_between_special_tokens,
919
+ **kwargs,
920
+ )
921
+
922
+ # Modified from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
923
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
924
+ """
925
+ Modified from `transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary` to support saving custom extension.
926
+ """
927
+ if not os.path.isdir(save_directory):
928
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
929
+ return
930
+ vocab_file = os.path.join(
931
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
932
+ )
933
+ merge_file = os.path.join(
934
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
935
+ )
936
+ sp_model_smiles = os.path.join(
937
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_SMILES"]
938
+ )
939
+ sp_model_iupac = os.path.join(
940
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_IUPAC"]
941
+ )
942
+ sp_model_fasta = os.path.join(
943
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_FASTA"]
944
+ )
945
+
946
+ with open(vocab_file, "w", encoding="utf-8") as f:
947
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
948
+
949
+ index = 0
950
+ with open(merge_file, "w", encoding="utf-8") as writer:
951
+ writer.write("#version: 0.2\n")
952
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
953
+ if index != token_index:
954
+ logger.warning(
955
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
956
+ " Please check that the tokenizer is not corrupted!"
957
+ )
958
+ index = token_index
959
+ writer.write(" ".join(bpe_tokens) + "\n")
960
+ index += 1
961
+
962
+ with open(sp_model_smiles, "wb") as f:
963
+ f.write(self.tokenizer_mapping["tokenizer_SMILES"].serialized_model_proto())
964
+
965
+ with open(sp_model_iupac, "wb") as f:
966
+ f.write(self.tokenizer_mapping["tokenizer_IUPAC"].serialized_model_proto())
967
+
968
+ with open(sp_model_fasta, "wb") as f:
969
+ f.write(self.tokenizer_mapping["tokenizer_FASTA"].serialized_model_proto())
970
+
971
+ return vocab_file, merge_file
972
+
973
+
974
+ __all__ = ["InternS1Tokenizer"]
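+ 
+ # --- Usage sketch (illustrative only; the repository id is a placeholder) ---
+ # The tokenizer switches to its SentencePiece sub-tokenizers when it encounters the domain
+ # special tokens (e.g. <SMILES>...</SMILES>) or when the auto-detection modules above flag a
+ # span as a SMILES / FASTA sequence.
+ #
+ # from transformers import AutoTokenizer
+ #
+ # tokenizer = AutoTokenizer.from_pretrained("<path-or-repo-id>", trust_remote_code=True)
+ # ids = tokenizer("The drug <SMILES>CC(=O)Oc1ccccc1C(=O)O</SMILES> is aspirin.")["input_ids"]
+ # print(tokenizer.decode(ids))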
tokenizer_FASTA.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e719023a50767e2da1165925feb3afe77d63702f08d0cd39c4ddadba7cdaaca
3
+ size 5899
tokenizer_IUPAC.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e719023a50767e2da1165925feb3afe77d63702f08d0cd39c4ddadba7cdaaca
3
+ size 5899
tokenizer_SMILES.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8dd3252680ab2fedacab7e71b75a48f08d6fbae70a9cc38d355c65ec42fbd0d
3
+ size 3290
tokenizer_config.json ADDED
@@ -0,0 +1,432 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151925": {
215
+ "content": "<SMILES>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151926": {
223
+ "content": "</SMILES>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151927": {
231
+ "content": "<SELFIES>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151928": {
239
+ "content": "</SELFIES>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "152441": {
247
+ "content": "<IUPAC>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "152442": {
255
+ "content": "</IUPAC>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "152955": {
263
+ "content": "<FASTA>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "152956": {
271
+ "content": "</FASTA>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "152957": {
279
+ "content": "<IMG_CONTEXT>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "152958": {
287
+ "content": "<img>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "152959": {
295
+ "content": "</img>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "152960": {
303
+ "content": "<quad>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "152961": {
311
+ "content": "</quad>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "152962": {
319
+ "content": "<ref>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "152963": {
327
+ "content": "</ref>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "152964": {
335
+ "content": "<box>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ },
342
+ "152965": {
343
+ "content": "</box>",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": true
349
+ },
350
+ "152966": {
351
+ "content": "<|action_start|>",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": true
357
+ },
358
+ "152967": {
359
+ "content": "<|action_end|>",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "152968": {
367
+ "content": "<|interpreter|>",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "152969": {
375
+ "content": "<|plugin|>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "152970": {
383
+ "content": "<video>",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ }
390
+ },
391
+ "additional_special_tokens": [
392
+ "<|im_start|>",
393
+ "<|im_end|>",
394
+ "<|object_ref_start|>",
395
+ "<|object_ref_end|>",
396
+ "<|box_start|>",
397
+ "<|box_end|>",
398
+ "<|quad_start|>",
399
+ "<|quad_end|>",
400
+ "<|vision_start|>",
401
+ "<|vision_end|>",
402
+ "<|vision_pad|>",
403
+ "<|image_pad|>",
404
+ "<|video_pad|>"
405
+ ],
406
+ "bos_token": null,
407
+ "clean_up_tokenization_spaces": false,
408
+ "context_image_token": "<IMG_CONTEXT>",
409
+ "end_image_token": "</img>",
410
+ "eos_token": "<|im_end|>",
411
+ "errors": "replace",
412
+ "extra_special_tokens": {
413
+ "context_image_token": "<IMG_CONTEXT>",
414
+ "end_image_token": "</img>",
415
+ "start_image_token": "<img>",
416
+ "video_token": "<video>"
417
+ },
418
+ "model_max_length": 32768,
419
+ "pad_token": "<|endoftext|>",
420
+ "processor_class": "InternS1Processor",
421
+ "split_special_tokens": false,
422
+ "start_image_token": "<img>",
423
+ "tokenizer_class": "InternS1Tokenizer",
424
+ "unk_token": null,
425
+ "video_token": "<video>",
426
+ "auto_map": {
427
+ "AutoTokenizer": [
428
+ "tokenization_interns1.InternS1Tokenizer",
429
+ "tokenization_interns1.InternS1Tokenizer"
430
+ ]
431
+ }
432
+ }
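The tokenizer config above registers the science/multimodal tags (e.g. `</IUPAC>`, `<FASTA>`, `<IMG_CONTEXT>`, `<|plugin|>`) as special tokens and routes `AutoTokenizer` to the custom `tokenization_interns1.InternS1Tokenizer` via `auto_map`. A minimal loading sketch, assuming the `transformers` library and a hypothetical hub id (the checkpoint's actual path is not stated here):

```python
# Minimal sketch, assuming the `transformers` library is installed and that the
# checkpoint lives at the (hypothetical) hub id below -- adjust as needed.
from transformers import AutoTokenizer

# trust_remote_code is required because `auto_map` points AutoTokenizer at the
# custom tokenization_interns1.InternS1Tokenizer shipped alongside this config.
tokenizer = AutoTokenizer.from_pretrained(
    "internlm/Intern-S1",  # hypothetical repo id, for illustration only
    trust_remote_code=True,
)

# The tags registered above are special tokens, so each maps to a single id
# (e.g. 152955/152956 for <FASTA>/</FASTA>) instead of being split by the BPE.
for tag in ["<FASTA>", "</FASTA>", "<IMG_CONTEXT>", "<|plugin|>"]:
    print(tag, tokenizer.convert_tokens_to_ids(tag))
```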
video_preprocessor_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "crop_size": null,
3
+ "crop_to_patches": false,
4
+ "data_format": "channels_first",
5
+ "default_to_square": true,
6
+ "device": null,
7
+ "disable_grouping": null,
8
+ "do_center_crop": null,
9
+ "do_convert_rgb": true,
10
+ "do_normalize": true,
11
+ "do_pad": null,
12
+ "do_rescale": true,
13
+ "do_resize": true,
14
+ "do_sample_frames": false,
15
+ "fps": null,
16
+ "image_mean": [
17
+ 0.485,
18
+ 0.456,
19
+ 0.406
20
+ ],
21
+ "image_processor_type": "GotOcr2ImageProcessorFast",
22
+ "image_std": [
23
+ 0.229,
24
+ 0.224,
25
+ 0.225
26
+ ],
27
+ "initial_shift": true,
28
+ "input_data_format": null,
29
+ "max_patches": 12,
30
+ "min_patches": 1,
31
+ "num_frames": null,
32
+ "processor_class": "InternS1Processor",
33
+ "resample": 3,
34
+ "rescale_factor": 0.00392156862745098,
35
+ "return_tensors": null,
36
+ "size": {
37
+ "height": 448,
38
+ "width": 448
39
+ },
40
+ "size_divisor": null,
41
+ "video_metadata": null,
42
+ "video_processor_type": "InternS1VideoProcessor",
43
+ "auto_map": {
44
+ "AutoVideoProcessor": "video_processing_interns1.InternS1VideoProcessor"
45
+ }
46
+ }
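This video preprocessor config resizes frames to 448×448, rescales by 1/255, normalizes with ImageNet mean/std, and maps `AutoVideoProcessor` to the custom `video_processing_interns1.InternS1VideoProcessor`. A minimal usage sketch, assuming a recent `transformers` release that ships `AutoVideoProcessor` and the same hypothetical repo id as above:

```python
# Minimal sketch, assuming a transformers version with AutoVideoProcessor and
# a hypothetical repo id; output shape is an expectation, not a guarantee.
import torch
from transformers import AutoVideoProcessor

video_processor = AutoVideoProcessor.from_pretrained(
    "internlm/Intern-S1",  # hypothetical repo id
    trust_remote_code=True,  # auto_map routes to video_processing_interns1
)

# Dummy 8-frame RGB clip in (num_frames, channels, height, width) layout.
video = torch.randint(0, 256, (8, 3, 360, 640), dtype=torch.uint8)

# do_sample_frames is False in this config, so all 8 frames are kept; frames
# are resized to 448x448, rescaled by 1/255 and normalized with ImageNet stats.
batch = video_processor(videos=[video], return_tensors="pt")
print(batch["pixel_values_videos"].shape)  # expected: (1, 8, 3, 448, 448)
```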
video_processing_interns1.py ADDED
@@ -0,0 +1,196 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Fast Video processor class for InternS1."""
16
+
17
+ from typing import Optional, Union
18
+
19
+ from transformers.image_processing_utils import BatchFeature
20
+ from transformers.image_utils import (
21
+ OPENAI_CLIP_MEAN,
22
+ OPENAI_CLIP_STD,
23
+ SizeDict,
24
+ )
25
+ from transformers.processing_utils import Unpack, VideosKwargs
26
+ from transformers.utils import (
27
+ TensorType,
28
+ is_torch_available,
29
+ is_torchvision_available,
30
+ is_torchvision_v2_available,
31
+ is_vision_available,
32
+ )
33
+ from transformers.utils.import_utils import requires
34
+ from transformers.video_processing_utils import BaseVideoProcessor
35
+ from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
36
+
37
+
38
+ if is_torchvision_available():
39
+ if is_torchvision_v2_available():
40
+ from torchvision.transforms.v2 import functional as F
41
+ else:
42
+ from torchvision.transforms import functional as F
43
+
44
+
45
+ if is_torch_available():
46
+ import torch
47
+
48
+ if is_vision_available():
49
+ from transformers.image_utils import PILImageResampling
50
+
51
+
52
+ class InternS1VideoProcessorInitKwargs(VideosKwargs):
53
+ initial_shift: Union[bool, float, int]
54
+
55
+
56
+ @requires(backends=("torchvision",))
57
+ class InternS1VideoProcessor(BaseVideoProcessor):
58
+ resample = PILImageResampling.BICUBIC
59
+ image_mean = OPENAI_CLIP_MEAN
60
+ image_std = OPENAI_CLIP_STD
61
+ size = {"height": 384, "width": 384}
62
+ do_resize = True
63
+ do_rescale = True
64
+ do_normalize = True
65
+ do_convert_rgb = True
66
+ initial_shift = True
67
+ do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
68
+ valid_kwargs = InternS1VideoProcessorInitKwargs
69
+ model_input_names = ["pixel_values_videos"]
70
+
71
+ def __init__(self, **kwargs: Unpack[InternS1VideoProcessorInitKwargs]):
72
+ super().__init__(**kwargs)
73
+
74
+ def sample_frames(
75
+ self,
76
+ video: "torch.Tensor",
77
+ metadata: Optional[Union[VideoMetadata, dict]] = None,
78
+ num_frames: Optional[int] = None,
79
+ fps: Optional[Union[int, float]] = None,
80
+ initial_shift: Optional[Union[bool, float, int]] = None,
81
+ ):
82
+ """
83
+ Default sampling function which uniformly samples the desired number of frames between 0 and the total number of frames.
84
+ If `fps` is passed along with metadata, `fps` frames per second are sampled uniformly. Arguments `num_frames`
85
+ and `fps` are mutually exclusive.
86
+
87
+ Args:
88
+ video (`torch.Tensor`):
89
+ Video that needs to be sampled.
90
+ metadata (`VideoMetadata`, *optional*):
91
+ Metadata of the video containing information about total duration, fps and total number of frames.
92
+ num_frames (`int`, *optional*):
93
+ Maximum number of frames to sample. Defaults to `self.num_frames`.
94
+ fps (`int` or `float`, *optional*):
95
+ Target frames to sample per second. Defaults to `self.fps`.
96
+ initial_shift (`bool`, `float` or `int`, defaults to `self.initial_shift`):
97
+ The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.
98
+
99
+ Returns:
100
+ torch.Tensor:
101
+ Sampled video frames.
102
+ """
103
+ num_frames = num_frames if num_frames is not None else self.num_frames
104
+ initial_shift = initial_shift if initial_shift is not None else self.initial_shift
105
+ total_num_frames = video.shape[0]
106
+
107
+ # If num_frames is not given but fps is, calculate num_frames from fps
108
+ if num_frames is None and fps is not None:
109
+ if metadata is None:
110
+ raise ValueError(
111
+ "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
112
+ "Please pass in a `VideoMetadata` object or use a fixed `num_frames` per input video"
113
+ )
114
+ num_frames = int(total_num_frames / metadata["fps"] * fps)
115
+
116
+ if initial_shift is True:
117
+ initial_shift = total_num_frames / num_frames / 2
118
+
119
+ if num_frames > total_num_frames:
120
+ raise ValueError(
121
+ f"Video can't be sampled. The `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
122
+ )
123
+
124
+ indices = torch.arange(initial_shift, total_num_frames, total_num_frames / num_frames).int()
125
+ video = video[indices].contiguous()
126
+ return video
127
+
128
+ def _preprocess(
129
+ self,
130
+ videos: list["torch.Tensor"],
131
+ video_metadata: Union[list[VideoMetadata], list[dict]],
132
+ do_convert_rgb: bool,
133
+ do_resize: bool,
134
+ size: SizeDict,
135
+ size_divisor: Optional[int],
136
+ interpolation: Optional["F.InterpolationMode"],
137
+ do_center_crop: bool,
138
+ crop_size: SizeDict,
139
+ do_rescale: bool,
140
+ do_pad: bool,
141
+ rescale_factor: float,
142
+ do_normalize: bool,
143
+ image_mean: Optional[Union[float, list[float]]],
144
+ image_std: Optional[Union[float, list[float]]],
145
+ do_sample_frames: Optional[bool] = None,
146
+ fps: Optional[Union[int, float]] = None,
147
+ num_frames: Optional[int] = None,
148
+ initial_shift: Optional[Union[bool, float, int]] = None,
149
+ return_tensors: Optional[Union[str, TensorType]] = None,
150
+ device: Optional["torch.Tensor"] = None,
151
+ ) -> BatchFeature:
152
+ if do_sample_frames:
153
+ # Sample video frames
154
+ videos = [
155
+ self.sample_frames(video, metadata, fps=fps, num_frames=num_frames, initial_shift=initial_shift)
156
+ for video, metadata in zip(videos, video_metadata)
157
+ ]
158
+
159
+ # We need to sample frames first before moving to device, if `do_sample_frames=True`. Otherwise
160
+ # moving the whole video incurs high GPU mem usage for long videos
161
+ if device is not None:
162
+ videos = [video.to(device) for video in videos]
163
+
164
+ # Group videos by size for batched resizing
165
+ grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
166
+ resized_videos_grouped = {}
167
+ for shape, stacked_videos in grouped_videos.items():
168
+ if do_convert_rgb:
169
+ stacked_videos = self.convert_to_rgb(stacked_videos)
170
+ if do_resize:
171
+ stacked_videos = self.resize(
172
+ stacked_videos, size=size, size_divisor=size_divisor, interpolation=interpolation
173
+ )
174
+ resized_videos_grouped[shape] = stacked_videos
175
+ resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)
176
+
177
+ # Group videos by size for further processing
178
+ # Needed in case do_resize is False, or resize returns videos with different sizes
179
+ grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
180
+ processed_videos_grouped = {}
181
+ for shape, stacked_videos in grouped_videos.items():
182
+ if do_center_crop:
183
+ stacked_videos = self.center_crop(stacked_videos, crop_size)
184
+ # Fused rescale and normalize
185
+ stacked_videos = self.rescale_and_normalize(
186
+ stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
187
+ )
188
+ processed_videos_grouped[shape] = stacked_videos
189
+
190
+ processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
191
+ processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos
192
+
193
+ return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)
194
+
195
+
196
+ __all__ = ["InternS1VideoProcessor"]
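The `sample_frames` method above performs plain uniform sampling with an optional initial shift: with `initial_shift=True`, the start index is moved to the middle of the first sampling interval. A small standalone sketch of that index arithmetic (illustrative frame counts, not taken from the repo):

```python
# Standalone sketch of the index arithmetic used by `sample_frames` above;
# the frame counts below are illustrative, not taken from the repo.
import torch

total_num_frames, num_frames = 32, 8
# With initial_shift=True the shift becomes half of one sampling interval,
# so frames are drawn from the middle of each interval rather than its start.
initial_shift = total_num_frames / num_frames / 2  # -> 2.0

indices = torch.arange(initial_shift, total_num_frames, total_num_frames / num_frames).int()
print(indices.tolist())  # [2, 6, 10, 14, 18, 22, 26, 30]
```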
vocab.json ADDED
The diff for this file is too large to render. See raw diff