Upload folder using huggingface_hub
- README.md +0 -1
- config.json +3 -4
- configuration_midashenglm.py +1 -3
- modeling_midashenglm.py +19 -22
README.md CHANGED
@@ -48,7 +48,6 @@ base_model:
 ...     "<|im_start|>assistant\n'"
 ... ]
 
-
 >>> import torch
 >>> with torch.no_grad():
 ...     model_inputs = processor(text=text, audio=audio, sampling_rate=sr))
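The hunk above only drops a blank line inside the README's usage example. For orientation, a minimal sketch of how that snippet is typically wired up end to end; the repository id, the prompt text, and the dummy waveform are placeholders for illustration and are not taken from this diff:

    import numpy as np
    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor

    repo_id = "<org>/<model>"  # placeholder; substitute the actual repository id
    processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

    sr = 16000
    audio = np.zeros(sr, dtype=np.float32)  # one second of silence as a stand-in waveform
    text = ["<|im_start|>user\nDescribe the audio.<|im_end|>\n<|im_start|>assistant\n"]  # hypothetical prompt

    with torch.no_grad():
        model_inputs = processor(text=text, audio=audio, sampling_rate=sr)
        generated = model.generate(**model_inputs)
    # assumes the processor forwards batch_decode to its tokenizer
    print(processor.batch_decode(generated, skip_special_tokens=True))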
config.json CHANGED
@@ -1,12 +1,11 @@
 {
   "architectures": [
-    "DashengQwen25OmniModelInstruct"
+    "MiDashengLMModel"
   ],
   "audio_encoder_config": {
     "attn_drop_rate": 0.0,
     "center": true,
     "depth": 32,
-    "drop_path_rate": 0.0,
     "drop_rate": 0.0,
     "embed_dim": 1280,
     "f_max": 8000.0,
@@ -34,8 +33,8 @@
     "win_length": 512
   },
   "auto_map": {
-    "AutoConfig": "configuration_midashenglm.
-    "AutoModelForCausalLM": "modeling_midashenglm.DashengQwen25OmniModelInstruct"
+    "AutoConfig": "configuration_midashenglm.MiDashengLMConfig",
+    "AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel"
   },
   "model_type": "miaudiollm",
   "resize_tokenizer": false,
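The auto_map update is what lets the generic Auto classes resolve to the renamed custom classes when the repository is loaded with remote code enabled. A small check, using a placeholder repository id:

    from transformers import AutoConfig, AutoModelForCausalLM

    repo_id = "<org>/<model>"  # placeholder for the actual repository id
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    print(type(config).__name__)  # MiDashengLMConfig, via "AutoConfig" in auto_map
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
    print(type(model).__name__)   # MiDashengLMModel, via "AutoModelForCausalLM" in auto_map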
configuration_midashenglm.py CHANGED
@@ -25,7 +25,6 @@ class DashengConfig(PretrainedConfig):
         init_values: float | None = None,
         drop_rate: float = 0.0,
         attn_drop_rate: float = 0.0,
-        drop_path_rate: float = 0.0,
         f_min: float = 0.0,
         f_max: float = 8000.0,
         center: bool = True,
@@ -49,7 +48,6 @@ class DashengConfig(PretrainedConfig):
         self.init_values = init_values
         self.drop_rate = drop_rate
         self.attn_drop_rate = attn_drop_rate
-        self.drop_path_rate = drop_path_rate
         self.f_min = f_min
         self.f_max = f_max
         self.center = center
@@ -61,7 +59,7 @@ class DashengConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class
+class MiDashengLMConfig(PretrainedConfig):
     model_type = "miaudiollm"
 
     def __init__(
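Because DashengConfig still forwards unrecognised keyword arguments to PretrainedConfig, a saved audio_encoder_config that carries the removed drop_path_rate key should keep loading; the value is simply stored as an inert attribute. A minimal sketch, assuming configuration_midashenglm.py is importable from the working directory:

    from configuration_midashenglm import DashengConfig

    cfg = DashengConfig(drop_path_rate=0.1)  # absorbed via **kwargs, no longer a named argument
    print(hasattr(cfg, "drop_path_rate"))    # True, but nothing in the encoder reads it anymore
    print(cfg.attn_drop_rate, cfg.f_max)     # 0.0 8000.0 -- defaults visible in the hunks above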
modeling_midashenglm.py CHANGED
@@ -16,7 +16,7 @@ from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
     Qwen2_5OmniThinkerTextModel,
 )
 
-from .configuration_midashenglm import DashengConfig,
+from .configuration_midashenglm import DashengConfig, MiDashengLMConfig
 
 
 def to_2tuple(x: Any) -> Tuple[Any, Any]:
@@ -72,7 +72,7 @@ class LayerScale(nn.Module):
         return x.mul_(self.gamma) if self.inplace else x * self.gamma
 
 
-class Mlp(nn.Module):
+class DashengMlp(nn.Module):
     def __init__(
         self,
         in_features: int,
@@ -98,7 +98,7 @@ class Mlp(nn.Module):
         return x
 
 
-class Attention(nn.Module):
+class DashengAttention(nn.Module):
     def __init__(
         self,
         dim: int,
@@ -163,7 +163,7 @@ class Attention(nn.Module):
         return x
 
 
-class Block(nn.Module):
+class DashengBlock(nn.Module):
     def __init__(
         self,
         dim: int,
@@ -173,10 +173,9 @@ class Block(nn.Module):
         drop: float = 0.0,
         attn_drop: float = 0.0,
         init_values: float | None = None,
-        drop_path: float = 0.0,
         act_layer: Type[nn.Module] = nn.GELU,
         norm_layer: Type[nn.Module] = nn.LayerNorm,
-        attention_type: Type[nn.Module] = Attention,
+        attention_type: Type[nn.Module] = DashengAttention,
     ):
         super().__init__()
         self.norm1 = norm_layer(dim)
@@ -192,7 +191,7 @@ class Block(nn.Module):
         )
 
         self.norm2 = norm_layer(dim)
-        self.mlp = Mlp(
+        self.mlp = DashengMlp(
             in_features=dim,
             hidden_features=int(dim * mlp_ratio),
             act_layer=act_layer,
@@ -209,7 +208,7 @@ class Block(nn.Module):
         return x
 
 
-class AudioTransformer(PreTrainedModel):
+class DashengAudioTransformer(PreTrainedModel):
     config_class = DashengConfig
 
     def __init__(self, config: DashengConfig):
@@ -252,12 +251,9 @@ class AudioTransformer(PreTrainedModel):
         )
 
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
-        dpr = [
-            x.item() for x in torch.linspace(0, config.drop_path_rate, config.depth)
-        ]  # stochastic depth decay rule
        self.pos_drop = nn.Dropout(p=config.drop_rate)
        self.blocks = nn.ModuleList(
-            Block(
+            DashengBlock(
                 dim=config.embed_dim,
                 num_heads=config.num_heads,
                 mlp_ratio=config.mlp_ratio,
@@ -265,7 +261,6 @@ class AudioTransformer(PreTrainedModel):
                 init_values=config.init_values,
                 drop=config.drop_rate,
                 attn_drop=config.attn_drop_rate,
-                drop_path=dpr[i],
                 norm_layer=norm_layer,
             )
             for i in range(config.depth)
@@ -367,14 +362,14 @@ class AudioProjectorSubsample(nn.Module):
 
 
 @dataclass
-class
+class Qwen25OmniTextModelOutput(ModelOutput):
     logits: torch.FloatTensor = None
     past_key_values: Optional[List[torch.FloatTensor]] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
 
 
-class Decoder(PreTrainedModel, GenerationMixin):
+class Qwen25OmniThinkerTextOnlyDecoder(PreTrainedModel, GenerationMixin):
     config_class = Qwen2_5OmniTextConfig
     _supports_flash_attn_2 = Qwen2_5OmniThinkerTextModel._supports_flash_attn_2
     _supports_sdpa = Qwen2_5OmniThinkerTextModel._supports_sdpa
@@ -397,7 +392,7 @@ class Decoder(PreTrainedModel, GenerationMixin):
         self,
         return_dict: Optional[bool] = None,
         **kwargs: Any,
-    ) ->
+    ) -> Qwen25OmniTextModelOutput:
         outputs: BaseModelOutputWithPast = self.model(
             return_dict=True,
             **kwargs,
@@ -418,7 +413,7 @@ class Decoder(PreTrainedModel, GenerationMixin):
             if v is not None
         )
 
-        return
+        return Qwen25OmniTextModelOutput(
             logits=logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
@@ -426,8 +421,8 @@ class Decoder(PreTrainedModel, GenerationMixin):
         )
 
 
-class DashengQwen25OmniModelInstruct(PreTrainedModel):
-    config_class =
+class MiDashengLMModel(PreTrainedModel):
+    config_class = MiDashengLMConfig
     _supports_flash_attn_2 = Qwen2_5OmniThinkerTextModel._supports_flash_attn_2
     _supports_sdpa = Qwen2_5OmniThinkerTextModel._supports_sdpa
     _supports_flex_attn = Qwen2_5OmniThinkerTextModel._supports_flex_attn
@@ -435,16 +430,18 @@ class DashengQwen25OmniModelInstruct(PreTrainedModel):
     _supports_static_cache = Qwen2_5OmniThinkerTextModel._supports_static_cache
     _supports_quantized_cache = Qwen2_5OmniThinkerTextModel._supports_quantized_cache
 
-    def __init__(self, config:
+    def __init__(self, config: MiDashengLMConfig):
         super().__init__(config)
 
-        self.audio_encoder =
+        self.audio_encoder = DashengAudioTransformer._from_config(
+            config.audio_encoder_config
+        )
         self.audio_projector = AudioProjectorSubsample(
             self.audio_encoder.embed_dim,
             config.text_config.hidden_size,
             config.subsample_factor,
         )
-        self.decoder =
+        self.decoder = Qwen25OmniThinkerTextOnlyDecoder._from_config(
             config.text_config,
             attn_implementation=config._attn_implementation,
         )
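Beyond the renames, the other change in this file is the removal of stochastic depth: the drop_path argument, the dpr decay schedule, and drop_path_rate all disappear. Stochastic depth only acts during training by randomly skipping residual branches, and with the rate of 0.0 in the shipped config it was already an identity op, so inference output is unchanged. For reference, a minimal sketch of what the removed path computed (a common formulation, not this repo's exact code):

    import torch
    from torch import nn

    class DropPath(nn.Module):
        """Stochastic depth: randomly zero a residual branch per sample while training."""

        def __init__(self, drop_prob: float = 0.0):
            super().__init__()
            self.drop_prob = drop_prob

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            if self.drop_prob == 0.0 or not self.training:
                return x  # identity at inference time or when the rate is zero
            keep_prob = 1.0 - self.drop_prob
            # One Bernoulli draw per sample, broadcast over the remaining dimensions.
            mask = x.new_empty((x.shape[0],) + (1,) * (x.ndim - 1)).bernoulli_(keep_prob)
            return x * mask / keep_prob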