zhoukz committed
Commit 44519c3 · 1 Parent(s): 182f777

Upload folder using huggingface_hub
README.md CHANGED
@@ -52,7 +52,9 @@ TODO: the following is required by Qwen2.5-Omni-3B; its import path is unknown and it needs to be removed
 
 >>> with torch.no_grad():
 ...     model_inputs = processor(text=text, audio=audio)
-...     output = model.generate(**model_inputs)
+...     generation = model.generate(**model_inputs)
+...     output = processor.batch_decode(generation, skip_special_tokens=True)
+
 >>> print(output)
 ["An engine is idling.'"]
 ```
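
The README fix matters because `model.generate` returns token ids, not strings; the added `processor.batch_decode` call turns those ids back into text. A minimal end-to-end sketch of the corrected usage — the repo id, audio file, and Auto-class loading path below are assumptions, not taken from this commit:

```python
import soundfile as sf
import torch
from transformers import AutoModel, AutoProcessor

repo_id = "zhoukz/..."  # placeholder; substitute the actual repo id
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

audio, sr = sf.read("example.wav")   # placeholder file; encoder expects 16 kHz mono
text = "Describe the audio."         # placeholder prompt

with torch.no_grad():
    model_inputs = processor(text=text, audio=audio)
    generation = model.generate(**model_inputs)  # tensor of generated token ids
    output = processor.batch_decode(generation, skip_special_tokens=True)  # ids -> text

print(output)  # e.g. ["An engine is idling."]
```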
config.json CHANGED
@@ -2,11 +2,36 @@
   "architectures": [
     "DashengQwen25OmniModelInstruct"
   ],
-  "audio_encoder": "LemonstoreWrapper",
-  "audio_encoder_args": {
-    "model_name": "audiotransformer_huge.dasheng06b.10s",
-    "pretrained": false,
-    "target_length": 1008
+  "audio_encoder_config": {
+    "attn_drop_rate": 0.0,
+    "center": true,
+    "depth": 32,
+    "drop_path_rate": 0.0,
+    "drop_rate": 0.0,
+    "embed_dim": 1280,
+    "f_max": 8000.0,
+    "f_min": 0.0,
+    "hop_length": 160,
+    "init_values": null,
+    "input_channels": 1,
+    "mlp_ratio": 4.0,
+    "model_type": "miaudiollm_dasheng_encoder",
+    "n_fft": 512,
+    "n_mels": 64,
+    "num_heads": 16,
+    "outputdim": 527,
+    "patch_size": [
+      64,
+      4
+    ],
+    "patch_stride": [
+      64,
+      4
+    ],
+    "qkv_bias": true,
+    "sample_rate": 16000,
+    "target_length": 1008,
+    "win_length": 512
   },
   "auto_map": {
     "AutoConfig": "configuration_midashenglm.MiAudioLLMHFConfig",
@@ -19,9 +44,37 @@
   "model_type": "miaudiollm",
   "resize_tokenizer": false,
   "subsample_factor": 5,
-  "text_model": "Qwen/Qwen2.5-Omni-3B",
-  "text_model_args": {},
+  "text_model_config": {
+    "_attn_implementation_autoset": true,
+    "attention_dropout": 0.0,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "init_std": 0.02,
+    "initializer_range": 0.02,
+    "intermediate_size": 11008,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 70,
+    "model_type": "qwen2_5_omni_text",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_section": [
+        16,
+        24,
+        24
+      ],
+      "rope_type": "default",
+      "type": "default"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": 32768,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 152064
+  },
   "torch_dtype": "float32",
-  "transformers_version": "4.52.0.dev0",
-  "use_encoderattention_mask": true
+  "transformers_version": "4.52.0.dev0"
 }
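
The net effect of this config change is that both sub-models are now described by fully materialized nested configs instead of string references ("LemonstoreWrapper", "Qwen/Qwen2.5-Omni-3B") that had to be resolved at load time. A quick way to inspect the nested configs after the change — the repo id is a placeholder, and it is assumed `trust_remote_code=True` resolves the `auto_map` entry:

```python
from transformers import AutoConfig

# auto_map points AutoConfig at configuration_midashenglm.MiAudioLLMHFConfig
cfg = AutoConfig.from_pretrained("zhoukz/...", trust_remote_code=True)  # placeholder repo id

print(cfg.audio_encoder_config.embed_dim)       # 1280, per the new config.json
print(cfg.audio_encoder_config.depth)           # 32
print(cfg.text_model_config.num_hidden_layers)  # 36
print(cfg.text_model_config.model_type)         # "qwen2_5_omni_text"
```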
configuration_midashenglm.py CHANGED
@@ -1,6 +1,63 @@
-from typing import Literal
+from typing import Dict, Literal, Optional, Tuple, Union
 
 from transformers import PretrainedConfig
+from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
+    Qwen2_5OmniTextConfig,
+)
+
+
+class DashengConfig(PretrainedConfig):
+    model_type = "miaudiollm_dasheng_encoder"
+
+    def __init__(
+        self,
+        embed_dim: int = 768,
+        outputdim: int = 527,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        patch_stride: Union[int, Tuple[int, int]] = 16,
+        input_channels: int = 1,
+        target_length: int = 1012,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        init_values: float | None = None,
+        drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        drop_path_rate: float = 0.0,
+        f_min: float = 0.0,
+        f_max: float = 8000.0,
+        center: bool = True,
+        win_length: int = 512,
+        hop_length: int = 160,
+        sample_rate: int = 16000,
+        n_fft: int = 512,
+        n_mels: int = 64,
+        **kwargs,
+    ):
+        self.embed_dim = embed_dim
+        self.outputdim = outputdim
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.input_channels = input_channels
+        self.target_length = target_length
+        self.depth = depth
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.init_values = init_values
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.f_min = f_min
+        self.f_max = f_max
+        self.center = center
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.n_mels = n_mels
+        super().__init__(**kwargs)
 
 
 class MiAudioLLMHFConfig(PretrainedConfig):
@@ -9,25 +66,21 @@ class MiAudioLLMHFConfig(PretrainedConfig):
     def __init__(
         self,
         model: str = "DashengQwen2ModelInstruct",
-        audio_encoder="LemonstoreWrapper",
-        audio_encoder_args=dict(
-            model_name="audiotransformer_base.dasheng.10s", pretrained=True
-        ),
-        text_model="Qwen/Qwen2.5-0.5B-Instruct",
-        text_model_args=dict(),
+        audio_encoder_config: Dict = {},
         freeze: Literal["audio", "text"] | str | None = None,
         lora: Literal["encoder", "decoder"] | None = None,
         subsample_factor: int = 5,
-        use_encoderattention_mask: bool = True,
+        text_model_config: Optional[Dict] = None,
         **kwargs,
     ):
         self.model = model
-        self.audio_encoder = audio_encoder
-        self.audio_encoder_args = audio_encoder_args
-        self.text_model = text_model
-        self.text_model_args = text_model_args
+        self.audio_encoder_config = DashengConfig(**audio_encoder_config)
         self.freeze = freeze
         self.lora = lora
         self.subsample_factor = subsample_factor
-        self.use_encoderattention_mask = use_encoderattention_mask
+        self.text_model_config = (
+            Qwen2_5OmniTextConfig(**text_model_config)
+            if text_model_config
+            else Qwen2_5OmniTextConfig()
+        )
         super().__init__(**kwargs)
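
With this refactor, nested plain dicts deserialize straight into typed config objects when the top-level config is constructed. A minimal sketch of that behavior, assuming the file is importable from a local checkout of the repo:

```python
from configuration_midashenglm import MiAudioLLMHFConfig

cfg = MiAudioLLMHFConfig(
    audio_encoder_config={"embed_dim": 1280, "depth": 32, "num_heads": 16},
    text_model_config=None,  # None falls back to a default Qwen2_5OmniTextConfig
)

print(type(cfg.audio_encoder_config).__name__)  # DashengConfig
print(cfg.audio_encoder_config.depth)           # 32
print(cfg.text_model_config.model_type)         # "qwen2_5_omni_text"
```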
generation_config.json ADDED
@@ -0,0 +1,8 @@
+{
+  "eos_token_id": [
+    151643,
+    151645
+  ],
+  "pad_token_id": 151643,
+  "transformers_version": "4.52.0.dev0"
+}
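
Adding generation_config.json means `model.generate` picks up stop and padding ids without the caller passing them explicitly: decoding stops on either eos id (151643 or 151645) and pads with 151643. A quick check of what gets loaded, again with a placeholder repo id:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("zhoukz/...")  # placeholder repo id

print(gen_cfg.eos_token_id)  # [151643, 151645]
print(gen_cfg.pad_token_id)  # 151643
```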
model.safetensors.index.json CHANGED
@@ -1,405 +1,405 @@
 {
   "metadata": {
-    "total_size": 9383791884
+    "total_size": 9385880844
   },
   "weight_map": {
-    "audio_encoder.model.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.25.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.27.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.28.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.29.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.30.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.31.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.freq_pos_embed": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.front_end.0.mel_scale.fb": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.front_end.0.spectrogram.window": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.init_bn.1.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.init_bn.1.num_batches_tracked": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.init_bn.1.running_mean": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.init_bn.1.running_var": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.init_bn.1.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.norm.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.norm.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
-    "audio_encoder.model.time_pos_embed": "model-00001-of-00002.safetensors",
     "audio_projector.net.0.bias": "model-00002-of-00002.safetensors",
     "audio_projector.net.0.weight": "model-00002-of-00002.safetensors",
     "audio_projector.net.2.bias": "model-00002-of-00002.safetensors",
+    "audio_encoder.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.norm1.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.norm2.bias": "model-00001-of-00002.safetensors",
+    "audio_encoder.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
222
+ "audio_encoder.blocks.25.attn.proj.bias": "model-00001-of-00002.safetensors",
223
+ "audio_encoder.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
224
+ "audio_encoder.blocks.25.attn.qkv.bias": "model-00001-of-00002.safetensors",
225
+ "audio_encoder.blocks.25.attn.qkv.weight": "model-00001-of-00002.safetensors",
226
+ "audio_encoder.blocks.25.mlp.fc1.bias": "model-00001-of-00002.safetensors",
227
+ "audio_encoder.blocks.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
228
+ "audio_encoder.blocks.25.mlp.fc2.bias": "model-00001-of-00002.safetensors",
229
+ "audio_encoder.blocks.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
230
+ "audio_encoder.blocks.25.norm1.bias": "model-00001-of-00002.safetensors",
231
+ "audio_encoder.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
232
+ "audio_encoder.blocks.25.norm2.bias": "model-00001-of-00002.safetensors",
233
+ "audio_encoder.blocks.25.norm2.weight": "model-00001-of-00002.safetensors",
234
+ "audio_encoder.blocks.26.attn.proj.bias": "model-00001-of-00002.safetensors",
235
+ "audio_encoder.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
236
+ "audio_encoder.blocks.26.attn.qkv.bias": "model-00001-of-00002.safetensors",
237
+ "audio_encoder.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
238
+ "audio_encoder.blocks.26.mlp.fc1.bias": "model-00001-of-00002.safetensors",
239
+ "audio_encoder.blocks.26.mlp.fc1.weight": "model-00001-of-00002.safetensors",
240
+ "audio_encoder.blocks.26.mlp.fc2.bias": "model-00001-of-00002.safetensors",
241
+ "audio_encoder.blocks.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
242
+ "audio_encoder.blocks.26.norm1.bias": "model-00001-of-00002.safetensors",
243
+ "audio_encoder.blocks.26.norm1.weight": "model-00001-of-00002.safetensors",
244
+ "audio_encoder.blocks.26.norm2.bias": "model-00001-of-00002.safetensors",
245
+ "audio_encoder.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
246
+ "audio_encoder.blocks.27.attn.proj.bias": "model-00001-of-00002.safetensors",
247
+ "audio_encoder.blocks.27.attn.proj.weight": "model-00001-of-00002.safetensors",
248
+ "audio_encoder.blocks.27.attn.qkv.bias": "model-00001-of-00002.safetensors",
249
+ "audio_encoder.blocks.27.attn.qkv.weight": "model-00001-of-00002.safetensors",
250
+ "audio_encoder.blocks.27.mlp.fc1.bias": "model-00001-of-00002.safetensors",
251
+ "audio_encoder.blocks.27.mlp.fc1.weight": "model-00001-of-00002.safetensors",
252
+ "audio_encoder.blocks.27.mlp.fc2.bias": "model-00001-of-00002.safetensors",
253
+ "audio_encoder.blocks.27.mlp.fc2.weight": "model-00001-of-00002.safetensors",
254
+ "audio_encoder.blocks.27.norm1.bias": "model-00001-of-00002.safetensors",
255
+ "audio_encoder.blocks.27.norm1.weight": "model-00001-of-00002.safetensors",
256
+ "audio_encoder.blocks.27.norm2.bias": "model-00001-of-00002.safetensors",
257
+ "audio_encoder.blocks.27.norm2.weight": "model-00001-of-00002.safetensors",
258
+ "audio_encoder.blocks.28.attn.proj.bias": "model-00001-of-00002.safetensors",
259
+ "audio_encoder.blocks.28.attn.proj.weight": "model-00001-of-00002.safetensors",
260
+ "audio_encoder.blocks.28.attn.qkv.bias": "model-00001-of-00002.safetensors",
261
+ "audio_encoder.blocks.28.attn.qkv.weight": "model-00001-of-00002.safetensors",
262
+ "audio_encoder.blocks.28.mlp.fc1.bias": "model-00001-of-00002.safetensors",
263
+ "audio_encoder.blocks.28.mlp.fc1.weight": "model-00001-of-00002.safetensors",
264
+ "audio_encoder.blocks.28.mlp.fc2.bias": "model-00001-of-00002.safetensors",
265
+ "audio_encoder.blocks.28.mlp.fc2.weight": "model-00001-of-00002.safetensors",
266
+ "audio_encoder.blocks.28.norm1.bias": "model-00001-of-00002.safetensors",
267
+ "audio_encoder.blocks.28.norm1.weight": "model-00001-of-00002.safetensors",
268
+ "audio_encoder.blocks.28.norm2.bias": "model-00001-of-00002.safetensors",
269
+ "audio_encoder.blocks.28.norm2.weight": "model-00001-of-00002.safetensors",
270
+ "audio_encoder.blocks.29.attn.proj.bias": "model-00001-of-00002.safetensors",
271
+ "audio_encoder.blocks.29.attn.proj.weight": "model-00001-of-00002.safetensors",
272
+ "audio_encoder.blocks.29.attn.qkv.bias": "model-00001-of-00002.safetensors",
273
+ "audio_encoder.blocks.29.attn.qkv.weight": "model-00001-of-00002.safetensors",
274
+ "audio_encoder.blocks.29.mlp.fc1.bias": "model-00001-of-00002.safetensors",
275
+ "audio_encoder.blocks.29.mlp.fc1.weight": "model-00001-of-00002.safetensors",
276
+ "audio_encoder.blocks.29.mlp.fc2.bias": "model-00001-of-00002.safetensors",
277
+ "audio_encoder.blocks.29.mlp.fc2.weight": "model-00001-of-00002.safetensors",
278
+ "audio_encoder.blocks.29.norm1.bias": "model-00001-of-00002.safetensors",
279
+ "audio_encoder.blocks.29.norm1.weight": "model-00001-of-00002.safetensors",
280
+ "audio_encoder.blocks.29.norm2.bias": "model-00001-of-00002.safetensors",
281
+ "audio_encoder.blocks.29.norm2.weight": "model-00001-of-00002.safetensors",
282
+ "audio_encoder.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
283
+ "audio_encoder.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
284
+ "audio_encoder.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
285
+ "audio_encoder.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
286
+ "audio_encoder.blocks.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
287
+ "audio_encoder.blocks.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
288
+ "audio_encoder.blocks.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
289
+ "audio_encoder.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
290
+ "audio_encoder.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
291
+ "audio_encoder.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
292
+ "audio_encoder.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
293
+ "audio_encoder.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
294
+ "audio_encoder.blocks.30.attn.proj.bias": "model-00001-of-00002.safetensors",
295
+ "audio_encoder.blocks.30.attn.proj.weight": "model-00001-of-00002.safetensors",
296
+ "audio_encoder.blocks.30.attn.qkv.bias": "model-00001-of-00002.safetensors",
297
+ "audio_encoder.blocks.30.attn.qkv.weight": "model-00001-of-00002.safetensors",
298
+ "audio_encoder.blocks.30.mlp.fc1.bias": "model-00001-of-00002.safetensors",
299
+ "audio_encoder.blocks.30.mlp.fc1.weight": "model-00001-of-00002.safetensors",
300
+ "audio_encoder.blocks.30.mlp.fc2.bias": "model-00001-of-00002.safetensors",
301
+ "audio_encoder.blocks.30.mlp.fc2.weight": "model-00001-of-00002.safetensors",
302
+ "audio_encoder.blocks.30.norm1.bias": "model-00001-of-00002.safetensors",
303
+ "audio_encoder.blocks.30.norm1.weight": "model-00001-of-00002.safetensors",
304
+ "audio_encoder.blocks.30.norm2.bias": "model-00001-of-00002.safetensors",
305
+ "audio_encoder.blocks.30.norm2.weight": "model-00001-of-00002.safetensors",
306
+ "audio_encoder.blocks.31.attn.proj.bias": "model-00001-of-00002.safetensors",
307
+ "audio_encoder.blocks.31.attn.proj.weight": "model-00001-of-00002.safetensors",
308
+ "audio_encoder.blocks.31.attn.qkv.bias": "model-00001-of-00002.safetensors",
309
+ "audio_encoder.blocks.31.attn.qkv.weight": "model-00001-of-00002.safetensors",
310
+ "audio_encoder.blocks.31.mlp.fc1.bias": "model-00001-of-00002.safetensors",
311
+ "audio_encoder.blocks.31.mlp.fc1.weight": "model-00001-of-00002.safetensors",
312
+ "audio_encoder.blocks.31.mlp.fc2.bias": "model-00001-of-00002.safetensors",
313
+ "audio_encoder.blocks.31.mlp.fc2.weight": "model-00001-of-00002.safetensors",
314
+ "audio_encoder.blocks.31.norm1.bias": "model-00001-of-00002.safetensors",
315
+ "audio_encoder.blocks.31.norm1.weight": "model-00001-of-00002.safetensors",
316
+ "audio_encoder.blocks.31.norm2.bias": "model-00001-of-00002.safetensors",
317
+ "audio_encoder.blocks.31.norm2.weight": "model-00001-of-00002.safetensors",
318
+ "audio_encoder.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
319
+ "audio_encoder.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
320
+ "audio_encoder.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
321
+ "audio_encoder.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
322
+ "audio_encoder.blocks.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
323
+ "audio_encoder.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
324
+ "audio_encoder.blocks.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
325
+ "audio_encoder.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
326
+ "audio_encoder.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
327
+ "audio_encoder.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
328
+ "audio_encoder.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
329
+ "audio_encoder.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
330
+ "audio_encoder.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
331
+ "audio_encoder.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
332
+ "audio_encoder.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
333
+ "audio_encoder.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
334
+ "audio_encoder.blocks.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
335
+ "audio_encoder.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
336
+ "audio_encoder.blocks.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
337
+ "audio_encoder.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
338
+ "audio_encoder.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
339
+ "audio_encoder.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
340
+ "audio_encoder.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
341
+ "audio_encoder.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
342
+ "audio_encoder.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
343
+ "audio_encoder.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
344
+ "audio_encoder.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
345
+ "audio_encoder.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
346
+ "audio_encoder.blocks.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
347
+ "audio_encoder.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
348
+ "audio_encoder.blocks.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
349
+ "audio_encoder.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
350
+ "audio_encoder.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
351
+ "audio_encoder.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
352
+ "audio_encoder.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
353
+ "audio_encoder.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
354
+ "audio_encoder.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
355
+ "audio_encoder.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
356
+ "audio_encoder.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
357
+ "audio_encoder.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
358
+ "audio_encoder.blocks.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
359
+ "audio_encoder.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
360
+ "audio_encoder.blocks.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
361
+ "audio_encoder.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
362
+ "audio_encoder.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
363
+ "audio_encoder.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
364
+ "audio_encoder.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
365
+ "audio_encoder.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
366
+ "audio_encoder.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
367
+ "audio_encoder.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
368
+ "audio_encoder.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
369
+ "audio_encoder.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
370
+ "audio_encoder.blocks.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
371
+ "audio_encoder.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
372
+ "audio_encoder.blocks.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
373
+ "audio_encoder.blocks.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
374
+ "audio_encoder.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
375
+ "audio_encoder.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
376
+ "audio_encoder.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
377
+ "audio_encoder.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
378
+ "audio_encoder.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
379
+ "audio_encoder.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
380
+ "audio_encoder.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
381
+ "audio_encoder.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
382
+ "audio_encoder.blocks.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
383
+ "audio_encoder.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
384
+ "audio_encoder.blocks.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
385
+ "audio_encoder.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
386
+ "audio_encoder.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
387
+ "audio_encoder.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
388
+ "audio_encoder.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
389
+ "audio_encoder.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
390
+ "audio_encoder.freq_pos_embed": "model-00001-of-00002.safetensors",
391
+ "audio_encoder.front_end.0.mel_scale.fb": "model-00001-of-00002.safetensors",
392
+ "audio_encoder.front_end.0.spectrogram.window": "model-00001-of-00002.safetensors",
393
+ "audio_encoder.init_bn.1.bias": "model-00001-of-00002.safetensors",
394
+ "audio_encoder.init_bn.1.num_batches_tracked": "model-00001-of-00002.safetensors",
395
+ "audio_encoder.init_bn.1.running_mean": "model-00001-of-00002.safetensors",
396
+ "audio_encoder.init_bn.1.running_var": "model-00001-of-00002.safetensors",
397
+ "audio_encoder.init_bn.1.weight": "model-00001-of-00002.safetensors",
398
+ "audio_encoder.norm.bias": "model-00001-of-00002.safetensors",
399
+ "audio_encoder.norm.weight": "model-00001-of-00002.safetensors",
400
+ "audio_encoder.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
401
+ "audio_encoder.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
402
+ "audio_encoder.time_pos_embed": "model-00001-of-00002.safetensors",
403
  "audio_projector.net.0.bias": "model-00002-of-00002.safetensors",
404
  "audio_projector.net.0.weight": "model-00002-of-00002.safetensors",
405
  "audio_projector.net.2.bias": "model-00002-of-00002.safetensors",
modeling_midashenglm.py CHANGED
@@ -1,50 +1,22 @@
1
  import collections.abc
 
2
  from functools import partial
3
- from typing import Any, Callable, Iterable, Literal, Optional, Tuple, Type, Union
4
 
5
  import torch
6
  import torch.nn as nn
7
  import torchaudio.transforms as audio_transforms
8
  from torch import Tensor
9
- from transformers import PreTrainedModel

10
 
11
- from .configuration_midashenglm import MiAudioLLMHFConfig
12
-
13
-
14
- class AudioProjectorSubsample(torch.nn.Module):
15
- def __init__(self, in_dim: int, out_dim: int, downsample_rate=5):
16
- super().__init__()
17
- self.k = downsample_rate
18
- self.net = torch.nn.Sequential(
19
- torch.nn.Linear(in_dim * self.k, out_dim),
20
- torch.nn.GELU(),
21
- torch.nn.Linear(out_dim, out_dim),
22
- )
23
-
24
- def forward(self, x, mask=None):
25
- """
26
- inputs is the output of audio encoder.
27
- :param x: [B, T, D]
28
- :param x_lens: [B, T]
29
- :return: [B, T', D']
30
- """
31
- batch_size, seq_len, dim = x.shape
32
- num_frames_to_discard = seq_len % self.k
33
- if num_frames_to_discard > 0:
34
- x = x[:, :-num_frames_to_discard, :]
35
- if mask is not None:
36
- mask = mask[:, :-num_frames_to_discard]
37
- if mask is None:
38
- mask = torch.ones(x.shape[:-1], dtype=torch.long, device=x.device)
39
- x = x.reshape(
40
- batch_size, -1, self.k * dim
41
- ) # rearrange(x, "b (s k) d -> b s (k d)", k=self.k)
42
- x = self.net(x)
43
- mask = mask.reshape(
44
- batch_size, -1, self.k
45
- ) # rearrange(mask, "b (s k) -> b s k", k=self.k)
46
- mask = mask.any(dim=-1).long()
47
- return x, mask
48
 
49
 
50
  # The functions `drop_path` and the module `DropPath` are taken from timm
@@ -144,7 +116,7 @@ class Mlp(nn.Module):
144
  in_features: int,
145
  hidden_features: Optional[int] = None,
146
  out_features: Optional[int] = None,
147
- act_layer: Type[torch.nn.Module] = nn.GELU,
148
  drop: float = 0.0,
149
  ):
150
  super().__init__()
@@ -238,11 +210,11 @@ class Block(nn.Module):
238
  qkv_bias: bool = False,
239
  drop: float = 0.0,
240
  attn_drop: float = 0.0,
241
- init_values=None,
242
  drop_path: float = 0.0,
243
- act_layer: Type[torch.nn.Module] = nn.GELU,
244
- norm_layer: Type[torch.nn.Module] = nn.LayerNorm,
245
- attention_type: Type[torch.nn.Module] = Attention,
246
  ):
247
  super().__init__()
248
  self.norm1 = norm_layer(dim)
@@ -277,6 +249,7 @@ class Block(nn.Module):
277
  return x
278
 
279
 
 
280
  class RearranceReplace(nn.Module):
281
  def forward(self, x: torch.Tensor) -> torch.Tensor:
282
  # rearrange(x, "b c f t -> b f c t")
@@ -288,69 +261,23 @@ class RearranceReplace(nn.Module):
288
  class AudioTransformer(nn.Module):
289
  def __init__(
290
  self,
291
- outputdim: int = 527,
292
- patch_size: Union[int, Tuple[int, int]] = 16,
293
- patch_stride: Union[int, Tuple[int, int]] = 16,
294
- embed_dim: int = 768,
295
- depth: int = 12,
296
- num_heads: int = 12,
297
- mlp_ratio: float = 4.0,
298
- qkv_bias: bool = True,
299
- drop_rate: float = 0.0,
300
- attn_drop_rate: float = 0.0,
301
- drop_path_rate: float = 0.0,
302
- norm_layer: torch.nn.Module | None = None,
303
- act_layer: Type[torch.nn.Module] = nn.GELU,
304
- init_values=None,
305
- target_length: int = 1012,
306
- input_channels: int = 1,
307
- pooling: Literal["mean", "token", "dm", "logit", "cat"] | None = "token",
308
- time_patch_out: float | None = None,
309
- freq_patch_out: float | None = None,
310
- block_type: Type[torch.nn.Module] = Block,
311
- attention_type: Type[torch.nn.Module] = Attention,
312
- eval_avg: Literal["mean", "max", "cat"] = "mean",
313
- n_mels: int = 64,
314
- n_fft: int = 512,
315
- hop_size: int = 160,
316
- win_size: int = 512,
317
- f_min: float = 0.0,
318
- f_max: float = 8000.0,
319
- sample_rate: int = 16000,
320
- center: bool = True,
321
- pad_last: bool = True,
322
  ):
323
  super().__init__()
324
- assert pooling in ("mean", "token", "dm", "logit", "cat", None)
325
- self.outputdim = outputdim
326
- self.pooling = pooling
327
- self.embed_dim = embed_dim
328
- self.patch_stride = patch_stride
329
- self.patch_size = patch_size
330
- self.n_mels = n_mels
331
- self.n_fft = n_fft
332
- self.hop_size = hop_size
333
- self.win_size = win_size
334
- self.f_min = f_min
335
- self.f_max = f_max
336
- self.sample_rate = sample_rate
337
- self.center = center
338
- self.pad_last = pad_last
339
- self.input_channels = input_channels
340
- self.eval_avg = eval_avg
341
- self.time_patch_out = time_patch_out
342
- self.freq_patch_out = freq_patch_out
343
 
344
  self.front_end = nn.Sequential(
345
  audio_transforms.MelSpectrogram(
346
- f_min=self.f_min,
347
- f_max=self.f_max,
348
- center=self.center,
349
- win_length=self.win_size,
350
- hop_length=self.hop_size,
351
- sample_rate=self.sample_rate,
352
- n_fft=self.n_fft,
353
- n_mels=self.n_mels,
354
  ),
355
  audio_transforms.AmplitudeToDB(top_db=120),
356
  )
@@ -358,62 +285,47 @@ class AudioTransformer(nn.Module):
358
  self.init_bn = nn.Sequential(
359
  # Rearrange("b c f t -> b f c t"),
360
  RearranceReplace(),
361
- torch.nn.BatchNorm2d(self.n_mels, momentum=0.01),
362
  # Rearrange("b f c t -> b c f t"),
363
  RearranceReplace(),
364
  )
365
 
366
- self.target_length = target_length
367
-
368
- patch_stride = to_2tuple(self.patch_stride)[-1]
369
- # Allowed length in number of frames, otherwise the positional embedding will throw an error
370
- self.maximal_allowed_length = self.target_length
371
-
372
  self.patch_embed = AudioPatchEmbed(
373
- input_size=(self.n_mels, target_length),
374
- embed_dim=self.embed_dim,
375
- in_chans=self.input_channels,
376
- patch_size=self.patch_size,
377
  flatten=False,
378
- patch_stride=self.patch_stride,
379
  )
380
 
381
- if self.pooling == "token":
382
- self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
383
- self.token_pos_embed = nn.Parameter(torch.randn(1, embed_dim) * 0.02)
384
-
385
  self.time_pos_embed = nn.Parameter(
386
- torch.randn(1, embed_dim, 1, self.patch_embed.grid_size[1]) * 0.02
387
  )
388
  self.freq_pos_embed = nn.Parameter(
389
- torch.randn(1, embed_dim, self.patch_embed.grid_size[0], 1) * 0.02
390
  )
391
 
392
- norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
393
- act_layer = act_layer or nn.GELU
394
  dpr = [
395
- x.item() for x in torch.linspace(0, drop_path_rate, depth)
396
  ] # stochastic depth decay rule
397
- self.pos_drop = nn.Dropout(p=drop_rate)
398
  self.blocks = nn.ModuleList(
399
- block_type(
400
- dim=embed_dim,
401
- num_heads=num_heads,
402
- mlp_ratio=mlp_ratio,
403
- qkv_bias=qkv_bias,
404
- init_values=init_values,
405
- drop=drop_rate,
406
- attn_drop=attn_drop_rate,
407
  drop_path=dpr[i],
408
  norm_layer=norm_layer,
409
- act_layer=act_layer,
410
- attention_type=attention_type,
411
  )
412
- for i in range(depth)
413
  )
414
- self.norm = norm_layer(embed_dim)
415
- if hasattr(self, "cls_token") and self.cls_token is not None:
416
- nn.init.normal_(self.cls_token, std=1e-6)
417
 
418
  def forward_features(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
419
  t = x.shape[-1]
@@ -424,119 +336,23 @@ class AudioTransformer(nn.Module):
424
  x = torch.permute(
425
  torch.flatten(x, 2, 3), (0, 2, 1)
426
  ) # rearrange(x, "b c f t -> b (f t) c")
427
- if self.pooling == "token":
428
- cls_token = self.cls_token.expand(x.shape[0], -1, -1)
429
- cls_token = cls_token + self.token_pos_embed
430
- x = torch.cat((cls_token, x), dim=1)
431
  x = self.pos_drop(x)
432
  for block in self.blocks:
433
  x = block(x, **kwargs)
434
  x = self.norm(x)
435
  return x
436
 
437
- # TODO
438
- # ================ From this line onward, the code diverges significantly from the Dasheng codebase ================
439
-
440
- def forward_head(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
441
- mask = kwargs.get("mask", None)
442
- if self.pooling == "token":
443
- x = x[:, 0]
444
- return x.sigmoid()
445
- elif self.pooling == "mean":
446
- if mask is not None:
447
- m = (1.0 - mask.float()).unsqueeze(-1) # 1.0 means is masked
448
- x = torch.nan_to_num((x * m).sum(1) / m.sum(1))
449
- else:
450
- x = x.mean(1)
451
- return x.sigmoid()
452
- elif self.pooling == "logit":
453
- if mask is not None:
454
- m = (1.0 - mask.float()).unsqueeze(-1) # 1.0 means is masked
455
- x = torch.nan_to_num((x * m).sum(1) / m.sum(1))
456
- else:
457
- x = x.mean(1)
458
- return x
459
- elif self.pooling == "dm":
460
- # Unpack using the frequency dimension, which is constant
461
- b, _, d = x.shape
462
- x = x.reshape(
463
- b, -1, self.patch_embed.grid_size[0], d
464
- ) # rearrange(x, "b (f t) d -> b f t d")
465
- # First pool in frequency, then sigmoid the (B T D) output
466
- x = (x.mean(1)).sigmoid()
467
- return x.mean(1)
468
- elif self.pooling is None:
469
- return x
470
- else:
471
- return x.mean(1)
472
-
473
- def _audiosample_to_mellength(self, lengths: torch.Tensor) -> torch.Tensor:
474
- if self.center:
475
- lengths = lengths + self.win_size
476
- lengths = 1 + ((lengths - self.win_size) / self.hop_size).long()
477
- return lengths
478
-
479
- # Calculates the number of patches for a given length in audio-samples
480
- # For example : torch.Tensor([16000]) will return 250 for Dasheng
481
- def _audiosample_to_patchlength(self, lengths: torch.Tensor) -> torch.Tensor:
482
- lengths = self._audiosample_to_mellength(lengths)
483
- return self._frames_to_patchlength(lengths)
484
-
485
- # Calculates the same as above but for a spectrogram input
486
- # i.e., [100] will return 25 for Dasheng
487
- def _frames_to_patchlength(self, lengths: torch.Tensor) -> torch.Tensor:
488
- patch_stride = to_2tuple(self.patch_stride)
489
- patch_size = to_2tuple(self.patch_size)
490
- frequency_patch_size = self.n_mels // patch_stride[0]
491
- time_patch_size = patch_stride[1]
492
- time_window_size = patch_size[1]
493
- number_of_tokens = (
494
- torch.floor((lengths - time_window_size) / time_patch_size) + 1
495
- ) * frequency_patch_size
496
- if self.pooling == "token":
497
- number_of_tokens += 1
498
- return number_of_tokens
499
-
500
- # Note that we use (... t f) -> (f t) here, meaning that patches are ordered as:
501
- # 0 4 -> 0 4 1 5 2 6 3 7
502
- # 1 5
503
- # 2 6
504
- # 3 7
505
- # This function does the (t f) -> (f t) transform
506
- def _reshape_mask_to_ft_format(self, mask: torch.Tensor) -> torch.Tensor:
507
- n_freq_patches = self.n_mels // to_2tuple(self.patch_stride)[0]
508
- mask = (
509
- mask.reshape(-1, n_freq_patches)
510
- .transpose(-2, -1)
511
- .flatten(-2)
512
- .reshape_as(mask)
513
- )
514
- return mask
515
-
516
- def _to_binary_mask(self, lengths: torch.Tensor, max_length: int) -> torch.Tensor:
517
- batch_size = len(lengths)
518
- lengths = self._audiosample_to_patchlength(lengths)
519
- idx = torch.arange(max_length, device=lengths.device)
520
- idx = idx.repeat(batch_size).view(batch_size, max_length)
521
- mask = (idx >= lengths.unsqueeze(-1)).bool()
522
- return mask
523
-
524
  def _to_mask(self, lengths: torch.Tensor, max_length: int) -> torch.Tensor:
525
  batch_size = len(lengths)
526
  idx = torch.arange(max_length, device=lengths.device)
527
  idx = idx.repeat(batch_size).view(batch_size, max_length)
528
- mask = (idx >= lengths.unsqueeze(-1)).bool()
529
  return mask
530
 
531
- def _create_mask(self, x_length, audio_length_in_spec_frames: int):
532
- max_length_in_patches = self._frames_to_patchlength(
533
- torch.tensor(audio_length_in_spec_frames)
534
- )
535
- mask_1d = self._to_binary_mask(x_length, max_length=int(max_length_in_patches))
536
- return mask_1d
537
-
538
  def forward(
539
- self, x: torch.Tensor, x_length: Optional[torch.Tensor] = None
 
 
540
  ) -> torch.Tensor:
541
  x = self.front_end(x)
542
  target_length_in_patches = self.target_length // 4
@@ -547,109 +363,120 @@ class AudioTransformer(nn.Module):
547
  t = x.shape[-1]
548
 
549
  input_splits = x.split(target_length_in_patches, dim=-1)
550
- mask = None # Single mask
551
- masks = [None for _ in range(len(input_splits))]
552
 
553
  if x_length is not None:
554
  assert len(x_length) == len(x), (
555
  "batchsizes of input x and x_length need to be same"
556
  )
557
  assert x_length.ndim == 1, "Lengths are of size (B,)"
558
- scaled_lengths = (
559
- x_length / (self.hop_size * 4)
560
- ).long() # 40ms for all dasheng models
561
- # Note that the mask is in (t f) format, but transformers here use (f t) format
562
- mask = self._to_mask(
563
- max_length=t,
564
- lengths=scaled_lengths,
565
- )
566
- # Trim mask to only use valid "patches", since x.shape[-1] is based on the possibly padded input
567
- masks = mask.split(target_length_in_patches, dim=-1)
568
 
569
  outputs = []
570
 
571
- for split_x, mask in zip(input_splits, masks):
572
  forward_kwargs = {}
573
- forward_kwargs["mask"] = mask
574
  split_x = self.forward_features(split_x, **forward_kwargs)
575
- split_x = self.forward_head(split_x, **forward_kwargs)
576
  outputs.append(split_x)
577
  x = torch.cat(outputs, dim=1)
578
- return x
579
 
580
 
581
- class LemonstoreWrapper(nn.Module):
582
- def __init__(
583
- self,
584
- append_cls_token: bool = False,
585
- **kwargs,
586
- ):
587
  super().__init__()
588
- self.append_cls_token = (
589
- append_cls_token # Pool all tokens to one as a "cls" token
 
 
 
590
  )
591
 
592
- model_default_kwargs = {
593
- "audiotransformer_huge.dasheng06b.10s": {
594
- "embed_dim": 1280,
595
- "depth": 32,
596
- "num_heads": 16,
597
- "pooling": "mean",
598
- "drop_path_rate": 0.0,
599
- "outputdim": 527,
600
- "patch_size": [64, 4],
601
- "patch_stride": [64, 4],
602
- "target_length": 1008,
603
- }
604
- }
605
- if "pretrained" in kwargs:
606
- del kwargs["pretrained"]
607
-
608
- create_kwargs = model_default_kwargs[kwargs.pop("model_name")]
609
- create_kwargs.update(kwargs)
610
- create_kwargs.update(
611
- pooling=None,
612
- eval_avg="cat",
613
- )
614
 
615
- self.model = AudioTransformer(**create_kwargs)
616
- self.embed_dim = self.model.embed_dim
617
 
618
- def _to_mask(self, lengths: torch.Tensor, max_length: int) -> torch.Tensor:
619
- batch_size = len(lengths)
620
- idx = torch.arange(max_length, device=lengths.device)
621
- idx = idx.repeat(batch_size).view(batch_size, max_length)
622
- mask = (idx < lengths.unsqueeze(-1)).long()
623
- return mask
624
 
625
- def _create_encoder_attention_mask(
626
- self, model_output: torch.Tensor, input_lengths: torch.Tensor
627
- ):
628
- scaled_lengths = (
629
- input_lengths / (self.model.hop_size * 4)
630
- ).long() # 40ms for all dasheng models
631
- return self._to_mask(scaled_lengths, max_length=model_output.shape[1])

632
 
633
  def forward(
634
  self,
635
- input: torch.Tensor,
636
- input_length: Optional[torch.Tensor] = None,
637
- return_attention_mask: bool = False,
638
- ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
639
- emb = self.model(input, input_length)
640
- # Outputs are added to multiple of 10s, remove the padded items
641
- if input_length is not None:
642
- input_length = input_length + self.model.n_fft
643
- scaled_lengths = (
644
- (1 + (input_length - self.model.n_fft) / self.model.hop_size) // 4
645
- ).long() # 40ms for all dasheng models
646
- max_length = torch.max(scaled_lengths)
647
- emb = emb[:, :max_length, :]
648
- if self.append_cls_token:
649
- emb = torch.cat([emb.mean(1, keepdims=True), emb], dim=1)
650
- if return_attention_mask and input_length is not None:
651
- return emb, self._create_encoder_attention_mask(emb, input_length)
652
- return emb

653
 
654
 
655
  class DashengQwen25OmniModelInstruct(PreTrainedModel):
@@ -658,98 +485,53 @@ class DashengQwen25OmniModelInstruct(PreTrainedModel):
658
  def __init__(self, config: MiAudioLLMHFConfig):
659
  super().__init__(config)
660
 
661
- audio_encoder = config.audio_encoder
662
- audio_encoder_args = config.audio_encoder_args
663
- text_model = config.text_model
664
- text_model_args = config.text_model_args
665
  freeze = config.freeze
666
  lora = config.lora
667
  subsample_factor = config.subsample_factor
668
- use_encoderattention_mask = config.use_encoderattention_mask
669
- resize_tokenizer = True
670
- force_fp32 = False
671
-
672
- from transformers.models.qwen2_5_omni import (
673
- Qwen2_5OmniProcessor,
674
- Qwen2_5OmniThinkerForConditionalGeneration,
675
- )
676
 
677
  self.subsample_factor = subsample_factor
678
  self.lora = lora
679
- self.use_encoderattention_mask = use_encoderattention_mask
680
  # Encoder part
681
- assert audio_encoder == "LemonstoreWrapper"
682
- self.audio_encoder = LemonstoreWrapper(**audio_encoder_args)
683
  assert lora != "encoder"
684
 
685
- # For some reason, torch.cuda.is_bf16_supported() does return True on V100
686
- has_bf16support = torch.cuda.get_device_capability(torch.device("cuda"))[0] > 7
687
-
688
  # decoder
689
- self.processor = Qwen2_5OmniProcessor.from_pretrained(text_model)
690
- self.tokenizer = self.processor.tokenizer
691
- self.decoder = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
692
- text_model,
693
- attn_implementation="sdpa",
694
- torch_dtype=torch.bfloat16
695
- if not force_fp32 and has_bf16support
696
- else torch.float32,
697
- **text_model_args,
698
- )
699
- del self.decoder.visual
700
- del self.decoder.audio_tower
701
- hidden_size_text = self.decoder.model.config.hidden_size
702
- # Overwrite default ForCausalLMLoss, now also support reduction
703
- special_tokens = [
704
- "<|en|>",
705
- "<|kr|>",
706
- "<|de|>",
707
- "<|es|>",
708
- "<|fr|>",
709
- "<|hi|>",
710
- "<|uk|>",
711
- "<|th|>",
712
- "<|vi|>",
713
- "<|nl|>",
714
- "<|pt|>",
715
- "<|id|>",
716
- "<|ru|>",
717
- "<|it|>",
718
- "<|ar|>",
719
- "<|jp|>",
720
- "<|unknown|>",
721
- ]
722
- self.tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
723
- if resize_tokenizer:
724
- self.decoder.model.resize_token_embeddings(len(self.tokenizer))
725
  assert lora != "decoder"
726
  assert freeze is None
727
 
728
  # audio projector
729
  self.audio_projector = AudioProjectorSubsample(
730
- self.audio_encoder.embed_dim, hidden_size_text, self.subsample_factor
 
 
731
  )
732
 
733
- def _forward_audio_encoder(self, audios, audio_length: Iterable[int] | None):
734
- encoder_out = self.audio_encoder(
735
- audios, audio_length, return_attention_mask=self.use_encoderattention_mask
736
- )
737
- encoder_atts = None
738
 
739
- if self.use_encoderattention_mask:
740
- encoder_out, encoder_atts = encoder_out
 
 
 
 
741
 
742
  # audio projector
743
  encoder_out, encoder_atts = self.audio_projector(encoder_out, encoder_atts)
744
 
745
- return encoder_out, encoder_atts
746
 
747
  def _prepare_with_input_ids(
748
- self, input_ids: torch.Tensor, audio_embeddings, audio_token_id
749
- ):
 
 
 
750
  special_mask = input_ids == audio_token_id
751
  assert audio_embeddings.shape[1] <= (special_mask.sum(-1)).max(), (
752
- "Mask and audio embeddings seem to have different sizes"
 
 
753
  )
754
  input_embeddings = self.decoder.model.embed_tokens(input_ids)
755
  audio_embeddings = audio_embeddings.to(input_embeddings.dtype)
@@ -762,85 +544,104 @@ class DashengQwen25OmniModelInstruct(PreTrainedModel):
762
 
763
  def forward(
764
  self,
765
- input_ids: Tensor,
766
- input_values: Tensor,
767
- audio_length: Iterable[int] | None,
768
- return_loss: bool = False,
769
- attention_mask: Tensor | None = None,
770
- audio_token_id: int | None = None,
 
771
  ):
772
- input_values = input_values.to(self.device)
773
- audio_encoder_hidden_states, _ = self._forward_audio_encoder(
774
- input_values, audio_length=audio_length
775
- )
776
-
777
- input_ids = input_ids.to(self.device)
778
- input_embeds = self._prepare_with_input_ids(
779
- input_ids=input_ids,
780
- audio_embeddings=audio_encoder_hidden_states,
781
- audio_token_id=audio_token_id,
782
- )
783
- input_mask = attention_mask
784
- decoder_targets = torch.nn.functional.pad(input_ids[:, 1:], (0, 1), value=-100)
785
-
786
- decoder_output = self.decoder(

787
  input_ids=None,
788
- inputs_embeds=input_embeds,
789
- attention_mask=input_mask,
790
- labels=decoder_targets,
791
- return_dict=True,
792
  )
793
 
794
- if return_loss:
795
- return decoder_output.loss
796
- return decoder_output.logits
797
-
798
  def generate(
799
  self,
800
- input_ids: Tensor,
801
- input_values: Tensor,
802
- audio_length: Iterable[int] | None,
803
- use_nucleus_sampling=False,
804
- max_length=1024,
805
- top_p=1.0,
806
- top_k: int = 50,
807
- temperature: float = 1.0,
808
- repetition_penalty=1.0,
809
- return_text=True,
810
- # The following are only used by HF
811
- attention_mask: Tensor | None = None,
812
- audio_token_id: int | None = None,
813
  ):
814
- encoder_hidden_states, encoder_atts = self._forward_audio_encoder(
815
- input_values, audio_length=audio_length
816
- )
817
-
818
- input_ids = input_ids.to(self.device)
819
- input_embeds = self._prepare_with_input_ids(
820
- input_ids=input_ids,
821
- audio_embeddings=encoder_hidden_states,
822
- audio_token_id=audio_token_id,
823
- )
824
- input_mask = attention_mask
825
-
826
- outputs = self.decoder.generate(
827
- inputs_embeds=input_embeds,
828
- attention_mask=input_mask,
829
- do_sample=use_nucleus_sampling,
830
- max_new_tokens=max_length,
831
- top_p=top_p,
832
- top_k=top_k,
833
- temperature=temperature,
834
- repetition_penalty=repetition_penalty,
835
- eos_token_id=[self.tokenizer.pad_token_id, self.tokenizer.eos_token_id],
836
- pad_token_id=self.tokenizer.pad_token_id,
837
- )
838
- if not return_text:
839
- return outputs
840
- texts = self.tokenizer.batch_decode(
841
- outputs,
842
- add_special_tokens=False,
843
- skip_special_tokens=True,
844
- clean_up_tokenization_spaces=True,

845
  )
846
- return texts
 
1
  import collections.abc
2
+ from dataclasses import dataclass
3
  from functools import partial
4
+ from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union
5
 
6
  import torch
7
  import torch.nn as nn
8
  import torchaudio.transforms as audio_transforms
9
  from torch import Tensor
10
+ from transformers import GenerationMixin, PreTrainedModel
11
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
12
+ from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
13
+ Qwen2_5OmniTextConfig,
14
+ )
15
+ from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
16
+ Qwen2_5OmniThinkerTextModel,
17
+ )
18
 
19
+ from .configuration_midashenglm import DashengConfig, MiAudioLLMHFConfig

20
 
21
 
22
  # The functions `drop_path` and the module `DropPath` are taken from timm
 
116
  in_features: int,
117
  hidden_features: Optional[int] = None,
118
  out_features: Optional[int] = None,
119
+ act_layer: Type[nn.Module] = nn.GELU,
120
  drop: float = 0.0,
121
  ):
122
  super().__init__()
 
210
  qkv_bias: bool = False,
211
  drop: float = 0.0,
212
  attn_drop: float = 0.0,
213
+ init_values: float | None = None,
214
  drop_path: float = 0.0,
215
+ act_layer: Type[nn.Module] = nn.GELU,
216
+ norm_layer: Type[nn.Module] = nn.LayerNorm,
217
+ attention_type: Type[nn.Module] = Attention,
218
  ):
219
  super().__init__()
220
  self.norm1 = norm_layer(dim)
 
249
  return x
250
 
251
 
252
+ # TODO
253
  class RearranceReplace(nn.Module):
254
  def forward(self, x: torch.Tensor) -> torch.Tensor:
255
  # rearrange(x, "b c f t -> b f c t")
 
261
  class AudioTransformer(nn.Module):
262
  def __init__(
263
  self,
264
+ config: DashengConfig,

265
  ):
266
  super().__init__()
267
+ self.target_length = config.target_length
268
+ self.embed_dim = config.embed_dim
269
+ self.hop_length = config.hop_length

270
 
271
  self.front_end = nn.Sequential(
272
  audio_transforms.MelSpectrogram(
273
+ f_min=config.f_min,
274
+ f_max=config.f_max,
275
+ center=config.center,
276
+ win_length=config.win_length,
277
+ hop_length=config.hop_length,
278
+ sample_rate=config.sample_rate,
279
+ n_fft=config.n_fft,
280
+ n_mels=config.n_mels,
281
  ),
282
  audio_transforms.AmplitudeToDB(top_db=120),
283
  )
 
285
  self.init_bn = nn.Sequential(
286
  # Rearrange("b c f t -> b f c t"),
287
  RearranceReplace(),
288
+ nn.BatchNorm2d(config.n_mels, momentum=0.01),
289
  # Rearrange("b f c t -> b c f t"),
290
  RearranceReplace(),
291
  )
292
 
 
 
 
 
 
 
293
  self.patch_embed = AudioPatchEmbed(
294
+ input_size=(config.n_mels, config.target_length),
295
+ embed_dim=config.embed_dim,
296
+ in_chans=config.input_channels,
297
+ patch_size=config.patch_size,
298
  flatten=False,
299
+ patch_stride=config.patch_stride,
300
  )
301
 
 
 
 
 
302
  self.time_pos_embed = nn.Parameter(
303
+ torch.randn(1, config.embed_dim, 1, self.patch_embed.grid_size[1]) * 0.02
304
  )
305
  self.freq_pos_embed = nn.Parameter(
306
+ torch.randn(1, config.embed_dim, self.patch_embed.grid_size[0], 1) * 0.02
307
  )
308
 
309
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
 
310
  dpr = [
311
+ x.item() for x in torch.linspace(0, config.drop_path_rate, config.depth)
312
  ] # stochastic depth decay rule
313
+ self.pos_drop = nn.Dropout(p=config.drop_rate)
314
  self.blocks = nn.ModuleList(
315
+ Block(
316
+ dim=config.embed_dim,
317
+ num_heads=config.num_heads,
318
+ mlp_ratio=config.mlp_ratio,
319
+ qkv_bias=config.qkv_bias,
320
+ init_values=config.init_values,
321
+ drop=config.drop_rate,
322
+ attn_drop=config.attn_drop_rate,
323
  drop_path=dpr[i],
324
  norm_layer=norm_layer,
 
 
325
  )
326
+ for i in range(config.depth)
327
  )
328
+ self.norm = norm_layer(config.embed_dim)
 
 
329
 
330
  def forward_features(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
331
  t = x.shape[-1]
 
336
  x = torch.permute(
337
  torch.flatten(x, 2, 3), (0, 2, 1)
338
  ) # rearrange(x, "b c f t -> b (f t) c")
 
 
 
 
339
  x = self.pos_drop(x)
340
  for block in self.blocks:
341
  x = block(x, **kwargs)
342
  x = self.norm(x)
343
  return x
344

345
  def _to_mask(self, lengths: torch.Tensor, max_length: int) -> torch.Tensor:
346
  batch_size = len(lengths)
347
  idx = torch.arange(max_length, device=lengths.device)
348
  idx = idx.repeat(batch_size).view(batch_size, max_length)
349
+ mask = (idx < lengths.unsqueeze(-1)).bool()
350
  return mask
351

352
  def forward(
353
+ self,
354
+ x: torch.Tensor,
355
+ x_length: Optional[torch.Tensor] = None,
356
  ) -> torch.Tensor:
357
  x = self.front_end(x)
358
  target_length_in_patches = self.target_length // 4
 
363
  t = x.shape[-1]
364
 
365
  input_splits = x.split(target_length_in_patches, dim=-1)
 
 
366
 
367
  if x_length is not None:
368
  assert len(x_length) == len(x), (
369
  "batchsizes of input x and x_length need to be same"
370
  )
371
  assert x_length.ndim == 1, "Lengths are of size (B,)"
372
+ scaled_lengths = (x_length / (self.hop_length * 4)).long()
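+ # hop_length * 4 samples per patch, i.e. 40 ms for all Dasheng models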
373
+ mask = self._to_mask(max_length=t, lengths=scaled_lengths)
374
+ split_masks = mask.logical_not().split(target_length_in_patches, dim=-1)
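+ # the attention blocks expect True = masked, hence the inversion of the validity mask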
375
+ else:
376
+ mask = None
377
+ split_masks = [None] * len(input_splits)
 
 
 
 
378
 
379
  outputs = []
380
 
381
+ for split_x, split_mask in zip(input_splits, split_masks):
382
  forward_kwargs = {}
383
+ forward_kwargs["mask"] = split_mask
384
  split_x = self.forward_features(split_x, **forward_kwargs)
 
385
  outputs.append(split_x)
386
  x = torch.cat(outputs, dim=1)
387
+ return x, mask
388
 
389
 
390
+ class AudioProjectorSubsample(nn.Module):
391
+ def __init__(self, in_dim: int, out_dim: int, downsample_rate=5):
 
 
 
 
392
  super().__init__()
393
+ self.k = downsample_rate
394
+ self.net = nn.Sequential(
395
+ nn.Linear(in_dim * self.k, out_dim),
396
+ nn.GELU(),
397
+ nn.Linear(out_dim, out_dim),
398
  )
399
 
400
+ def forward(self, x, mask=None):
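+ # x: [B, T, D] audio-encoder output; returns ([B, T // k, out_dim], subsampled mask)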
401
+ batch_size, seq_len, dim = x.shape
402
+ num_frames_to_discard = seq_len % self.k
403
+ if num_frames_to_discard > 0:
404
+ x = x[:, :-num_frames_to_discard, :]
405
+ if mask is not None:
406
+ mask = mask[:, :-num_frames_to_discard]
407
+ if mask is None:
408
+ mask = torch.ones(x.shape[:-1], dtype=torch.long, device=x.device)
409
+ x = x.reshape(
410
+ batch_size, -1, self.k * dim
411
+ ) # rearrange(x, "b (s k) d -> b s (k d)", k=self.k)
412
+ x = self.net(x)
413
+ mask = mask.reshape(
414
+ batch_size, -1, self.k
415
+ ) # rearrange(mask, "b (s k) -> b s k", k=self.k)
416
+ mask = mask.any(dim=-1).long()
417
+ return x, mask
 
 
 
 
418
 
 
 
419
 
420
+ @dataclass
421
+ class DashengQwen25OmniModelInstructOutput(ModelOutput):
422
+ logits: torch.FloatTensor = None
423
+ past_key_values: Optional[List[torch.FloatTensor]] = None
424
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
425
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
426
 
427
+
428
+ class Decoder(PreTrainedModel, GenerationMixin):
429
+ config_class = Qwen2_5OmniTextConfig
430
+
431
+ def __init__(self, config: Qwen2_5OmniTextConfig):
432
+ super().__init__(config)
433
+ self.model = Qwen2_5OmniThinkerTextModel._from_config(
434
+ config,
435
+ attn_implementation="sdpa", # TODO
436
+ )
437
+ self.lm_head = nn.Linear(
438
+ config.hidden_size,
439
+ config.vocab_size,
440
+ bias=False,
441
+ )
442
+ # TODO fix dtype
443
+ self.lm_head.weight.data = self.lm_head.weight.data.to(
444
+ self.model.embed_tokens.weight.dtype
445
+ )
446
+ # TODO tie weight?
447
+ self.post_init()
448
 
449
  def forward(
450
  self,
451
+ return_dict: Optional[bool] = None,
452
+ **kwargs: Any,
453
+ ) -> DashengQwen25OmniModelInstructOutput:
454
+ outputs: BaseModelOutputWithPast = self.model(
455
+ return_dict=True,
456
+ **kwargs,
457
+ )
458
+ hidden_states = outputs.last_hidden_state
459
+ logits = self.lm_head(hidden_states)
460
+
461
+ if not return_dict:
462
+ return tuple(
463
+ v
464
+ for v in [
465
+ logits,
466
+ outputs.last_hidden_state,
467
+ outputs.past_key_values,
468
+ outputs.hidden_states,
469
+ outputs.attentions,
470
+ ]
471
+ if v is not None
472
+ )
473
+
474
+ return DashengQwen25OmniModelInstructOutput(
475
+ logits=logits,
476
+ past_key_values=outputs.past_key_values,
477
+ hidden_states=outputs.hidden_states,
478
+ attentions=outputs.attentions,
479
+ )
480
 
481
 
482
  class DashengQwen25OmniModelInstruct(PreTrainedModel):
 
485
  def __init__(self, config: MiAudioLLMHFConfig):
486
  super().__init__(config)
487

488
  freeze = config.freeze
489
  lora = config.lora
490
  subsample_factor = config.subsample_factor

491
 
492
  self.subsample_factor = subsample_factor
493
  self.lora = lora
 
494
  # Encoder part
495
+ self.audio_encoder = AudioTransformer(config.audio_encoder_config)
 
496
  assert lora != "encoder"
497
 
 
 
 
498
  # decoder
499
+ self.decoder = Decoder(config.text_model_config)

500
  assert lora != "decoder"
501
  assert freeze is None
502
 
503
  # audio projector
504
  self.audio_projector = AudioProjectorSubsample(
505
+ self.audio_encoder.embed_dim,
506
+ config.text_model_config.hidden_size,
507
+ self.subsample_factor,
508
  )
509
 
510
+ self.post_init()

 
512
+ def _forward_audio_encoder(
513
+ self,
514
+ audios: torch.Tensor,
515
+ audio_length: Optional[Iterable[int]],
516
+ ) -> torch.Tensor:
517
+ encoder_out, encoder_atts = self.audio_encoder(audios, audio_length)
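+ # the encoder returns features plus a validity mask (None when no lengths are given)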
518
 
519
  # audio projector
520
  encoder_out, encoder_atts = self.audio_projector(encoder_out, encoder_atts)
521
 
522
+ return encoder_out
523
 
524
  def _prepare_with_input_ids(
525
+ self,
526
+ input_ids: torch.Tensor,
527
+ audio_embeddings: torch.Tensor,
528
+ audio_token_id: int,
529
+ ) -> torch.Tensor:
530
  special_mask = input_ids == audio_token_id
531
  assert audio_embeddings.shape[1] <= (special_mask.sum(-1)).max(), (
532
+ "Mask and audio embeddings seem to have different sizes: "
533
+ f"{audio_embeddings.shape=}, {special_mask=}, {input_ids=}, "
534
+ f"{audio_embeddings.shape[1]=} vs {(special_mask.sum(-1)).max()=}"
535
  )
536
  input_embeddings = self.decoder.model.embed_tokens(input_ids)
537
  audio_embeddings = audio_embeddings.to(input_embeddings.dtype)
 
544
 
545
  def forward(
546
  self,
547
+ input_ids: Optional[Tensor] = None,
548
+ input_values: Optional[Tensor] = None,
549
+ inputs_embeds: Optional[Tensor] = None,
550
+ audio_length: Optional[Iterable[int]] = None,
551
+ attention_mask: Optional[Tensor] = None,
552
+ audio_token_id: Optional[int] = None,
553
+ **kwargs: Any,
554
  ):
555
+ if input_ids is not None:
556
+ if inputs_embeds is not None:
557
+ raise ValueError(
558
+ "Both `inputs_embeds` and `input_ids` are passed. Please pass only one of them."
559
+ )
560
+
561
+ if input_values is not None:
562
+ input_values = input_values.to(self.device)
563
+ audio_encoder_hidden_states = self._forward_audio_encoder(
564
+ input_values, audio_length=audio_length
565
+ )
566
+ else:
567
+ batch, _ = input_ids.shape
568
+ input_values = torch.zeros(
569
+ batch,
570
+ 0,
571
+ self.audio_encoder.embed_dim,
572
+ device=input_ids.device,
573
+ )
574
+
575
+ input_ids = input_ids.to(self.device)
576
+ inputs_embeds = self._prepare_with_input_ids(
577
+ input_ids=input_ids,
578
+ audio_embeddings=audio_encoder_hidden_states,
579
+ audio_token_id=audio_token_id,
580
+ )
581
+ else:
582
+ if inputs_embeds is None:
583
+ raise ValueError(
584
+ "Either `input_ids` or `inputs_embeds` must be passed."
585
+ )
586
+ if input_values is not None:
587
+ raise ValueError(
588
+ "Cannot pass `input_values` when `inputs_embeds` is provided."
589
+ )
590
+
591
+ return self.decoder(
592
  input_ids=None,
593
+ inputs_embeds=inputs_embeds,
594
+ attention_mask=attention_mask,
595
+ **kwargs,
 
596
  )
597

598
  def generate(
599
  self,
600
+ input_ids: Optional[Tensor] = None,
601
+ input_values: Optional[Tensor] = None,
602
+ inputs_embeds: Optional[Tensor] = None,
603
+ audio_length: Optional[Iterable[int]] = None,
604
+ audio_token_id: Optional[int] = None,
605
+ **kwargs,

606
  ):
607
+ if input_ids is not None:
608
+ if inputs_embeds is not None:
609
+ raise ValueError(
610
+ "Both `inputs_embeds` and `input_ids` are passed. Please pass only one of them."
611
+ )
612
+
613
+ if input_values is not None:
614
+ input_values = input_values.to(self.device)
615
+ audio_encoder_hidden_states = self._forward_audio_encoder(
616
+ input_values, audio_length=audio_length
617
+ )
618
+ else:
619
+ batch, _ = input_ids.shape
620
+ input_values = torch.zeros(
621
+ batch,
622
+ 0,
623
+ self.audio_encoder.embed_dim,
624
+ device=input_ids.device,
625
+ )
626
+
627
+ input_ids = input_ids.to(self.device)
628
+ inputs_embeds = self._prepare_with_input_ids(
629
+ input_ids=input_ids,
630
+ audio_embeddings=audio_encoder_hidden_states,
631
+ audio_token_id=audio_token_id,
632
+ )
633
+ else:
634
+ if inputs_embeds is None:
635
+ raise ValueError(
636
+ "Either `input_ids` or `inputs_embeds` must be passed."
637
+ )
638
+ if input_values is not None:
639
+ raise ValueError(
640
+ "Cannot pass `input_values` when `inputs_embeds` is provided."
641
+ )
642
+
643
+ return self.decoder.generate(
644
+ inputs_embeds=inputs_embeds,
645
+ generation_config=kwargs.pop("generation_config", self.generation_config),
646
+ **kwargs,
647
  )