zhoukz committed
Commit 4041525 · Parent: 89cff4d

Upload folder using huggingface_hub

README.md CHANGED
@@ -48,7 +48,6 @@ base_model:
 ... "<|im_start|>assistant\\n'"
 ... ]
 
-
 >>> import torch
 >>> with torch.no_grad():
 ... model_inputs = processor(text=text, audio=audio, sampling_rate=sr))
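For context, a minimal sketch of how this doctest is typically set up and continued. The repo id, the soundfile-based audio loading, and the `generate`/`batch_decode` calls below are assumptions for illustration, not taken from the diff; only the `processor(text=..., audio=..., sampling_rate=...)` call shape comes from the README hunk itself.

```python
# Hedged sketch only: the repo id "mispeech/midashenglm-7b", the audio loading, and the
# generate/decode calls are assumptions; the processor call mirrors the README snippet.
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "mispeech/midashenglm-7b"  # hypothetical repo id
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

audio, sr = sf.read("example.wav", dtype="float32")  # assumed input file
text = [
    "<|im_start|>user\nWhat can you hear in this clip?<|im_end|>\n"
    "<|im_start|>assistant\n"
]

with torch.no_grad():
    model_inputs = processor(text=text, audio=audio, sampling_rate=sr)
    generated = model.generate(**model_inputs)  # assumes a standard generate() API

print(processor.batch_decode(generated, skip_special_tokens=True))
```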
config.json CHANGED
@@ -1,12 +1,11 @@
 {
   "architectures": [
-    "DashengQwen25OmniModelInstruct"
+    "MiDashengLMModel"
   ],
   "audio_encoder_config": {
     "attn_drop_rate": 0.0,
     "center": true,
     "depth": 32,
-    "drop_path_rate": 0.0,
     "drop_rate": 0.0,
     "embed_dim": 1280,
     "f_max": 8000.0,
@@ -34,8 +33,8 @@
     "win_length": 512
   },
   "auto_map": {
-    "AutoConfig": "configuration_midashenglm.MiAudioLLMHFConfig",
-    "AutoModelForCausalLM": "modeling_midashenglm.DashengQwen25OmniModelInstruct"
+    "AutoConfig": "configuration_midashenglm.MiDashengLMConfig",
+    "AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel"
   },
   "model_type": "miaudiollm",
   "resize_tokenizer": false,
configuration_midashenglm.py CHANGED
@@ -25,7 +25,6 @@ class DashengConfig(PretrainedConfig):
         init_values: float | None = None,
         drop_rate: float = 0.0,
         attn_drop_rate: float = 0.0,
-        drop_path_rate: float = 0.0,
         f_min: float = 0.0,
         f_max: float = 8000.0,
         center: bool = True,
@@ -49,7 +48,6 @@ class DashengConfig(PretrainedConfig):
         self.init_values = init_values
         self.drop_rate = drop_rate
         self.attn_drop_rate = attn_drop_rate
-        self.drop_path_rate = drop_path_rate
         self.f_min = f_min
         self.f_max = f_max
         self.center = center
@@ -61,7 +59,7 @@ class DashengConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class MiAudioLLMHFConfig(PretrainedConfig):
+class MiDashengLMConfig(PretrainedConfig):
     model_type = "miaudiollm"
 
     def __init__(
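With `drop_path_rate` removed from both the signature and the attribute assignments, `DashengConfig` instances no longer carry that field unless it is explicitly passed through `**kwargs`. A quick sketch of the observable effect; the local import path is an assumption (e.g. a checkout of this repo), the field names come from the diff context:

```python
# Sketch of the effect of this change on DashengConfig; assumes the module is
# importable as configuration_midashenglm from a local checkout of the repo.
from configuration_midashenglm import DashengConfig

cfg = DashengConfig(drop_rate=0.0, attn_drop_rate=0.0)
print(hasattr(cfg, "drop_rate"))       # True  - still a declared field
print(hasattr(cfg, "drop_path_rate"))  # False - removed by this commit
```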
modeling_midashenglm.py CHANGED
@@ -16,7 +16,7 @@ from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
     Qwen2_5OmniThinkerTextModel,
 )
 
-from .configuration_midashenglm import DashengConfig, MiAudioLLMHFConfig
+from .configuration_midashenglm import DashengConfig, MiDashengLMConfig
 
 
 def to_2tuple(x: Any) -> Tuple[Any, Any]:
@@ -72,7 +72,7 @@ class LayerScale(nn.Module):
         return x.mul_(self.gamma) if self.inplace else x * self.gamma
 
 
-class Mlp(nn.Module):
+class DashengMlp(nn.Module):
     def __init__(
         self,
         in_features: int,
@@ -98,7 +98,7 @@ class Mlp(nn.Module):
         return x
 
 
-class Attention(nn.Module):
+class DashengAttention(nn.Module):
     def __init__(
         self,
         dim: int,
@@ -163,7 +163,7 @@ class Attention(nn.Module):
         return x
 
 
-class Block(nn.Module):
+class DashengBlock(nn.Module):
     def __init__(
         self,
         dim: int,
@@ -173,10 +173,9 @@ class Block(nn.Module):
         drop: float = 0.0,
         attn_drop: float = 0.0,
         init_values: float | None = None,
-        drop_path: float = 0.0,
         act_layer: Type[nn.Module] = nn.GELU,
         norm_layer: Type[nn.Module] = nn.LayerNorm,
-        attention_type: Type[nn.Module] = Attention,
+        attention_type: Type[nn.Module] = DashengAttention,
     ):
         super().__init__()
         self.norm1 = norm_layer(dim)
@@ -192,7 +191,7 @@ class Block(nn.Module):
         )
 
         self.norm2 = norm_layer(dim)
-        self.mlp = Mlp(
+        self.mlp = DashengMlp(
             in_features=dim,
             hidden_features=int(dim * mlp_ratio),
             act_layer=act_layer,
@@ -209,7 +208,7 @@ class Block(nn.Module):
         return x
 
 
-class AudioTransformer(PreTrainedModel):
+class DashengAudioTransformer(PreTrainedModel):
     config_class = DashengConfig
 
     def __init__(self, config: DashengConfig):
@@ -252,12 +251,9 @@ class AudioTransformer(PreTrainedModel):
         )
 
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
-        dpr = [
-            x.item() for x in torch.linspace(0, config.drop_path_rate, config.depth)
-        ]  # stochastic depth decay rule
         self.pos_drop = nn.Dropout(p=config.drop_rate)
         self.blocks = nn.ModuleList(
-            Block(
+            DashengBlock(
                 dim=config.embed_dim,
                 num_heads=config.num_heads,
                 mlp_ratio=config.mlp_ratio,
@@ -265,7 +261,6 @@ class AudioTransformer(PreTrainedModel):
                 init_values=config.init_values,
                 drop=config.drop_rate,
                 attn_drop=config.attn_drop_rate,
-                drop_path=dpr[i],
                 norm_layer=norm_layer,
             )
             for i in range(config.depth)
@@ -367,14 +362,14 @@ class AudioProjectorSubsample(nn.Module):
 
 
 @dataclass
-class DashengQwen25OmniModelInstructOutput(ModelOutput):
+class Qwen25OmniTextModelOutput(ModelOutput):
     logits: torch.FloatTensor = None
     past_key_values: Optional[List[torch.FloatTensor]] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
 
 
-class Decoder(PreTrainedModel, GenerationMixin):
+class Qwen25OmniThinkerTextOnlyDecoder(PreTrainedModel, GenerationMixin):
     config_class = Qwen2_5OmniTextConfig
     _supports_flash_attn_2 = Qwen2_5OmniThinkerTextModel._supports_flash_attn_2
     _supports_sdpa = Qwen2_5OmniThinkerTextModel._supports_sdpa
@@ -397,7 +392,7 @@ class Decoder(PreTrainedModel, GenerationMixin):
         self,
         return_dict: Optional[bool] = None,
         **kwargs: Any,
-    ) -> DashengQwen25OmniModelInstructOutput:
+    ) -> Qwen25OmniTextModelOutput:
         outputs: BaseModelOutputWithPast = self.model(
             return_dict=True,
             **kwargs,
@@ -418,7 +413,7 @@ class Decoder(PreTrainedModel, GenerationMixin):
             if v is not None
         )
 
-        return DashengQwen25OmniModelInstructOutput(
+        return Qwen25OmniTextModelOutput(
             logits=logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
@@ -426,8 +421,8 @@ class Decoder(PreTrainedModel, GenerationMixin):
         )
 
 
-class DashengQwen25OmniModelInstruct(PreTrainedModel):
-    config_class = MiAudioLLMHFConfig
+class MiDashengLMModel(PreTrainedModel):
+    config_class = MiDashengLMConfig
     _supports_flash_attn_2 = Qwen2_5OmniThinkerTextModel._supports_flash_attn_2
     _supports_sdpa = Qwen2_5OmniThinkerTextModel._supports_sdpa
     _supports_flex_attn = Qwen2_5OmniThinkerTextModel._supports_flex_attn
@@ -435,16 +430,18 @@ class DashengQwen25OmniModelInstruct(PreTrainedModel):
     _supports_static_cache = Qwen2_5OmniThinkerTextModel._supports_static_cache
     _supports_quantized_cache = Qwen2_5OmniThinkerTextModel._supports_quantized_cache
 
-    def __init__(self, config: MiAudioLLMHFConfig):
+    def __init__(self, config: MiDashengLMConfig):
         super().__init__(config)
 
-        self.audio_encoder = AudioTransformer._from_config(config.audio_encoder_config)
+        self.audio_encoder = DashengAudioTransformer._from_config(
+            config.audio_encoder_config
+        )
         self.audio_projector = AudioProjectorSubsample(
             self.audio_encoder.embed_dim,
             config.text_config.hidden_size,
             config.subsample_factor,
         )
-        self.decoder = Decoder._from_config(
+        self.decoder = Qwen25OmniThinkerTextOnlyDecoder._from_config(
            config.text_config,
            attn_implementation=config._attn_implementation,
        )
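Dropping the stochastic-depth plumbing (`drop_path_rate`, the `dpr` decay list, and `drop_path=dpr[i]`) is behavior-preserving for this checkpoint: the shipped config carried `drop_path_rate: 0.0` (removed from config.json above), so the decay rule produced an all-zero rate for every block and drop-path never fired. A small sketch of that reasoning:

```python
# Sketch of why removing the stochastic-depth code path changes nothing for this
# checkpoint: with drop_path_rate = 0.0, the old decay rule yields a zero rate for
# every block, i.e. drop-path was already a no-op.
import torch

drop_path_rate = 0.0  # value the old config.json carried
depth = 32            # "depth": 32 in audio_encoder_config

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # old decay rule
assert all(rate == 0.0 for rate in dpr)
print(dpr[:4])  # [0.0, 0.0, 0.0, 0.0]
```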