Sweaterdog commited on
Commit
c750975
·
verified ·
1 Parent(s): a171afc

Upload 16 files

Browse files
config.json CHANGED
@@ -1,190 +1,138 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "architectures": [
3
- "Qwen2_5_VLForConditionalGeneration"
4
  ],
5
  "attention_dropout": 0.0,
6
  "eos_token_id": 151645,
7
  "hidden_act": "silu",
8
  "hidden_size": 3584,
9
- "image_token_id": 151655,
10
  "initializer_range": 0.02,
11
  "intermediate_size": 18944,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "max_position_embeddings": 128000,
13
  "max_window_layers": 28,
14
- "model_type": "qwen2_5_vl",
15
  "num_attention_heads": 28,
16
  "num_hidden_layers": 28,
17
  "num_key_value_heads": 4,
18
  "pad_token_id": 151654,
19
  "rms_norm_eps": 1e-06,
20
  "rope_scaling": {
21
- "mrope_section": [
22
- 16,
23
- 24,
24
- 24
25
- ],
26
- "rope_type": "default",
27
- "type": "default"
28
  },
29
  "rope_theta": 1000000.0,
30
- "sliding_window": 32768,
31
- "text_config": {
32
- "architectures": [
33
- "Qwen2_5_VLForConditionalGeneration"
34
- ],
35
- "attention_dropout": 0.0,
36
- "eos_token_id": 151645,
37
- "hidden_act": "silu",
38
- "hidden_size": 3584,
39
- "image_token_id": null,
40
- "initializer_range": 0.02,
41
- "intermediate_size": 18944,
42
- "max_position_embeddings": 128000,
43
- "max_window_layers": 28,
44
- "model_type": "qwen2_5_vl_text",
45
- "num_attention_heads": 28,
46
- "num_hidden_layers": 28,
47
- "num_key_value_heads": 4,
48
- "pad_token_id": 151654,
49
- "quantization_config": {
50
- "_load_in_4bit": true,
51
- "_load_in_8bit": false,
52
- "bnb_4bit_compute_dtype": "bfloat16",
53
- "bnb_4bit_quant_storage": "uint8",
54
- "bnb_4bit_quant_type": "nf4",
55
- "bnb_4bit_use_double_quant": true,
56
- "llm_int8_enable_fp32_cpu_offload": false,
57
- "llm_int8_has_fp16_weight": false,
58
- "llm_int8_skip_modules": [
59
- "lm_head",
60
- "multi_modal_projector",
61
- "merger",
62
- "modality_projection",
63
- "visual.blocks.27.attn",
64
- "visual.blocks.28.attn",
65
- "visual.blocks.25.attn",
66
- "visual.blocks.22.attn",
67
- "visual.blocks.21.attn",
68
- "visual.blocks.29.mlp",
69
- "visual.blocks.24.attn",
70
- "visual.blocks.29.attn",
71
- "visual.blocks.31.attn",
72
- "visual.blocks.30.attn",
73
- "visual.blocks.28.mlp",
74
- "visual.blocks.20.attn",
75
- "visual.blocks.25.mlp",
76
- "visual.blocks.19.attn",
77
- "visual.blocks.26.mlp",
78
- "visual.blocks.24.mlp",
79
- "visual.blocks.17.attn",
80
- "visual.blocks.27.mlp",
81
- "visual.blocks.23.attn",
82
- "visual.blocks.23.mlp",
83
- "visual.blocks.21.mlp",
84
- "visual.blocks.19.mlp",
85
- "visual.blocks.18.attn",
86
- "visual.blocks.20.mlp",
87
- "visual.blocks.11.attn",
88
- "visual.blocks.9.mlp",
89
- "visual.blocks.9.attn",
90
- "visual.blocks.16.attn",
91
- "visual.blocks.11.mlp",
92
- "visual.blocks.22.mlp",
93
- "visual.blocks.18.mlp",
94
- "visual.blocks.13.attn",
95
- "visual.blocks.12.attn",
96
- "visual.blocks.6.attn",
97
- "visual.blocks.10.mlp",
98
- "visual.blocks.8.mlp",
99
- "visual.blocks.8.attn",
100
- "visual.blocks.14.attn",
101
- "visual.blocks.4.mlp",
102
- "visual.blocks.16.mlp",
103
- "visual.blocks.7.mlp",
104
- "visual.blocks.6.mlp",
105
- "visual.blocks.15.mlp",
106
- "visual.blocks.5.mlp",
107
- "visual.blocks.10.attn",
108
- "visual.blocks.3.mlp",
109
- "visual.blocks.12.mlp",
110
- "visual.blocks.13.mlp",
111
- "visual.blocks.14.mlp",
112
- "visual.blocks.2.mlp",
113
- "visual.blocks.5.attn",
114
- "visual.blocks.1.attn",
115
- "visual.blocks.2.attn",
116
- "visual.blocks.4.attn",
117
- "visual.blocks.3.attn",
118
- "visual.blocks.15.attn",
119
- "visual.blocks.1.mlp",
120
- "visual.blocks.17.mlp",
121
- "visual.blocks.0.attn",
122
- "visual.blocks.7.attn",
123
- "visual.blocks.0.mlp",
124
- "visual.blocks.31.mlp.down_proj"
125
- ],
126
- "llm_int8_threshold": 6.0,
127
- "load_in_4bit": true,
128
- "load_in_8bit": false,
129
- "quant_method": "bitsandbytes"
130
- },
131
- "rms_norm_eps": 1e-06,
132
- "rope_scaling": {
133
- "mrope_section": [
134
- 16,
135
- 24,
136
- 24
137
- ],
138
- "rope_type": "default",
139
- "type": "default"
140
- },
141
- "rope_theta": 1000000.0,
142
- "sliding_window": 32768,
143
- "torch_dtype": "float16",
144
- "unsloth_fixed": true,
145
- "use_cache": true,
146
- "use_sliding_window": false,
147
- "video_token_id": null,
148
- "vision_end_token_id": 151653,
149
- "vision_start_token_id": 151652,
150
- "vision_token_id": 151654,
151
- "vocab_size": 152064
152
- },
153
- "tie_word_embeddings": false,
154
- "torch_dtype": "float16",
155
- "transformers_version": "4.52.4",
156
  "unsloth_fixed": true,
157
- "unsloth_version": "2025.6.8",
158
  "use_cache": true,
159
  "use_sliding_window": false,
160
- "video_token_id": 151656,
161
- "vision_config": {
162
- "depth": 32,
163
- "fullatt_block_indexes": [
164
- 7,
165
- 15,
166
- 23,
167
- 31
168
- ],
169
- "hidden_act": "silu",
170
- "hidden_size": 1280,
171
- "in_channels": 3,
172
- "in_chans": 3,
173
- "initializer_range": 0.02,
174
- "intermediate_size": 3420,
175
- "model_type": "qwen2_5_vl",
176
- "num_heads": 16,
177
- "out_hidden_size": 3584,
178
- "patch_size": 14,
179
- "spatial_merge_size": 2,
180
- "spatial_patch_size": 14,
181
- "temporal_patch_size": 2,
182
- "tokens_per_second": 2,
183
- "torch_dtype": "float16",
184
- "window_size": 112
185
- },
186
  "vision_end_token_id": 151653,
187
  "vision_start_token_id": 151652,
188
  "vision_token_id": 151654,
189
  "vocab_size": 152064
190
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "architectures": [
3
+ "Qwen2_5_VLForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "eos_token_id": 151645,
7
+ "hidden_act": "silu",
8
+ "hidden_size": 3584,
9
+ "image_token_id": 151655,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 18944,
12
+ "max_position_embeddings": 128000,
13
+ "max_window_layers": 28,
14
+ "model_type": "qwen2_5_vl",
15
+ "num_attention_heads": 28,
16
+ "num_hidden_layers": 28,
17
+ "num_key_value_heads": 4,
18
+ "pad_token_id": 151654,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": {
21
+ "mrope_section": [
22
+ 16,
23
+ 24,
24
+ 24
25
+ ],
26
+ "rope_type": "default",
27
+ "type": "default"
28
+ },
29
+ "rope_theta": 1000000.0,
30
+ "sliding_window": 32768,
31
+ "text_config": {
32
  "architectures": [
33
+ "Qwen2_5_VLForConditionalGeneration"
34
  ],
35
  "attention_dropout": 0.0,
36
  "eos_token_id": 151645,
37
  "hidden_act": "silu",
38
  "hidden_size": 3584,
39
+ "image_token_id": null,
40
  "initializer_range": 0.02,
41
  "intermediate_size": 18944,
42
+ "layer_types": [
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention"
71
+ ],
72
  "max_position_embeddings": 128000,
73
  "max_window_layers": 28,
74
+ "model_type": "qwen2_5_vl_text",
75
  "num_attention_heads": 28,
76
  "num_hidden_layers": 28,
77
  "num_key_value_heads": 4,
78
  "pad_token_id": 151654,
79
  "rms_norm_eps": 1e-06,
80
  "rope_scaling": {
81
+ "mrope_section": [
82
+ 16,
83
+ 24,
84
+ 24
85
+ ],
86
+ "rope_type": "default",
87
+ "type": "default"
88
  },
89
  "rope_theta": 1000000.0,
90
+ "sliding_window": null,
91
+ "torch_dtype": "bfloat16",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  "unsloth_fixed": true,
 
93
  "use_cache": true,
94
  "use_sliding_window": false,
95
+ "video_token_id": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  "vision_end_token_id": 151653,
97
  "vision_start_token_id": 151652,
98
  "vision_token_id": 151654,
99
  "vocab_size": 152064
100
+ },
101
+ "tie_word_embeddings": false,
102
+ "torch_dtype": "bfloat16",
103
+ "transformers_version": "4.53.0",
104
+ "unsloth_fixed": true,
105
+ "unsloth_version": "2025.6.12",
106
+ "use_cache": true,
107
+ "use_sliding_window": false,
108
+ "video_token_id": 151656,
109
+ "vision_config": {
110
+ "depth": 32,
111
+ "fullatt_block_indexes": [
112
+ 7,
113
+ 15,
114
+ 23,
115
+ 31
116
+ ],
117
+ "hidden_act": "silu",
118
+ "hidden_size": 1280,
119
+ "in_channels": 3,
120
+ "in_chans": 3,
121
+ "initializer_range": 0.02,
122
+ "intermediate_size": 3420,
123
+ "model_type": "qwen2_5_vl",
124
+ "num_heads": 16,
125
+ "out_hidden_size": 3584,
126
+ "patch_size": 14,
127
+ "spatial_merge_size": 2,
128
+ "spatial_patch_size": 14,
129
+ "temporal_patch_size": 2,
130
+ "tokens_per_second": 2,
131
+ "torch_dtype": "bfloat16",
132
+ "window_size": 112
133
+ },
134
+ "vision_end_token_id": 151653,
135
+ "vision_start_token_id": 151652,
136
+ "vision_token_id": 151654,
137
+ "vocab_size": 152064
138
+ }
generation_config.json CHANGED
@@ -9,5 +9,5 @@
9
  "pad_token_id": 151654,
10
  "repetition_penalty": 1.05,
11
  "temperature": 1e-06,
12
- "transformers_version": "4.52.4"
13
  }
 
9
  "pad_token_id": 151654,
10
  "repetition_penalty": 1.05,
11
  "temperature": 1e-06,
12
+ "transformers_version": "4.53.0"
13
  }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d725335e4ea2399be706469e4b8807716a8fa64bd03468252e9f7acf2415fee4
3
+ size 4968243304
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1830db6908dcc76df3a71492acbcf2b8cac130114cf1f3c2d9edae8de8c6de3
3
+ size 4991495816
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09c1807c6d00d7cab94f7db39d4c02ebb8537225ccde383861ac48db97945aa6
3
+ size 4932751040
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dd068336d14d45ffb43cef374d286cc6ba9d8741b028f90a7d040d847961f4a
3
+ size 1691924384
video_preprocessor_config.json CHANGED
@@ -1,28 +1,4 @@
1
  {
2
- "_valid_kwargs_names": [
3
- "do_convert_rgb",
4
- "do_resize",
5
- "size",
6
- "size_divisor",
7
- "default_to_square",
8
- "resample",
9
- "do_rescale",
10
- "rescale_factor",
11
- "do_normalize",
12
- "image_mean",
13
- "image_std",
14
- "do_pad",
15
- "do_center_crop",
16
- "crop_size",
17
- "data_format",
18
- "input_data_format",
19
- "device",
20
- "min_pixels",
21
- "max_pixels",
22
- "patch_size",
23
- "temporal_patch_size",
24
- "merge_size"
25
- ],
26
  "crop_size": null,
27
  "data_format": "channels_first",
28
  "default_to_square": true,
@@ -33,45 +9,25 @@
33
  "do_pad": null,
34
  "do_rescale": true,
35
  "do_resize": true,
 
 
36
  "image_mean": [
37
  0.48145466,
38
  0.4578275,
39
  0.40821073
40
  ],
41
- "image_processor_type": "Qwen2VLImageProcessor",
42
  "image_std": [
43
  0.26862954,
44
  0.26130258,
45
  0.27577711
46
  ],
47
  "input_data_format": null,
 
48
  "max_pixels": 12845056,
49
  "merge_size": 2,
 
50
  "min_pixels": 3136,
51
- "model_valid_processing_keys": [
52
- "do_convert_rgb",
53
- "do_resize",
54
- "size",
55
- "size_divisor",
56
- "default_to_square",
57
- "resample",
58
- "do_rescale",
59
- "rescale_factor",
60
- "do_normalize",
61
- "image_mean",
62
- "image_std",
63
- "do_pad",
64
- "do_center_crop",
65
- "crop_size",
66
- "data_format",
67
- "input_data_format",
68
- "device",
69
- "min_pixels",
70
- "max_pixels",
71
- "patch_size",
72
- "temporal_patch_size",
73
- "merge_size"
74
- ],
75
  "patch_size": 14,
76
  "processor_class": "Qwen2_5_VLProcessor",
77
  "resample": 3,
@@ -82,5 +38,6 @@
82
  },
83
  "size_divisor": null,
84
  "temporal_patch_size": 2,
 
85
  "video_processor_type": "Qwen2VLVideoProcessor"
86
  }
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "crop_size": null,
3
  "data_format": "channels_first",
4
  "default_to_square": true,
 
9
  "do_pad": null,
10
  "do_rescale": true,
11
  "do_resize": true,
12
+ "do_sample_frames": false,
13
+ "fps": null,
14
  "image_mean": [
15
  0.48145466,
16
  0.4578275,
17
  0.40821073
18
  ],
 
19
  "image_std": [
20
  0.26862954,
21
  0.26130258,
22
  0.27577711
23
  ],
24
  "input_data_format": null,
25
+ "max_frames": 768,
26
  "max_pixels": 12845056,
27
  "merge_size": 2,
28
+ "min_frames": 4,
29
  "min_pixels": 3136,
30
+ "num_frames": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  "patch_size": 14,
32
  "processor_class": "Qwen2_5_VLProcessor",
33
  "resample": 3,
 
38
  },
39
  "size_divisor": null,
40
  "temporal_patch_size": 2,
41
+ "video_metadata": null,
42
  "video_processor_type": "Qwen2VLVideoProcessor"
43
  }