giuseppe-tanzi committed
Commit d848202 · verified · 1 Parent(s): 81a7527

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +6 -6
  2. __init__.py +6 -0
  3. example_usage.py +1 -0
  4. modeling_seamless_basic.py +111 -0
README.md CHANGED
@@ -64,7 +64,7 @@ import torch
 import numpy as np
 import importlib.util
 
-# Load model
+# Load model - architecture is included in the repository
 model = AutoModel.from_pretrained("videoloc/seamless-basic")
 config = AutoConfig.from_pretrained("videoloc/seamless-basic")
 
@@ -75,11 +75,11 @@ collator_module = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(collator_module)
 
 # Initialize data collator
-data_collator = collator_module.DataCollatorSimpleSeamless(
-    processor="facebook/hf-seamless-m4t-medium",
-    max_audio_length_sec=8.0,
-    max_text_length=256
-)
+data_collator = collator_module.DataCollatorSimpleSeamless(
+    processor="facebook/hf-seamless-m4t-medium",
+    max_audio_length_sec=8.0,
+    max_text_length=256
+)
 
 # Prepare your data
 your_data = [
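
The README snippet above is cut off before the collator module is actually loaded and before your_data is populated. A minimal end-to-end sketch of those steps follows; the collator filename (data_collator.py), the hf_hub_download call, and the record keys in your_data (raw_audio, text) are assumptions for illustration, not details taken from this commit.

```python
# Hypothetical completion of the README snippet - file name and data keys are assumed.
import importlib.util
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoModel

# Fetch the collator module shipped with the repo (assumed to be named data_collator.py)
collator_path = hf_hub_download("videoloc/seamless-basic", "data_collator.py")
spec = importlib.util.spec_from_file_location("data_collator", collator_path)
collator_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(collator_module)

data_collator = collator_module.DataCollatorSimpleSeamless(
    processor="facebook/hf-seamless-m4t-medium",
    max_audio_length_sec=8.0,
    max_text_length=256
)

model = AutoModel.from_pretrained("videoloc/seamless-basic")
model.eval()

# Assumed record layout: raw 16 kHz mono audio plus the matching subtitle text
your_data = [
    {"raw_audio": np.zeros(16000, dtype=np.float32), "text": "example subtitle"},
]
batch = data_collator(your_data)

with torch.no_grad():
    outputs = model(**batch)
print(outputs.logits)
```

If the collator expects different keys, adapt the your_data records to whatever DataCollatorSimpleSeamless documents in its own file.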
__init__.py ADDED
@@ -0,0 +1,6 @@
+"""
+SeamlessBasic model for HuggingFace Transformers
+"""
+from .modeling_seamless_basic import HFSeamlessBasic, SeamlessBasicConfig
+
+__all__ = ["HFSeamlessBasic", "SeamlessBasicConfig"]
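
The package init simply re-exports the two custom classes. For completeness, a hedged sketch of how such classes are typically wired into the transformers Auto* loaders when working from a local clone is shown below; the registration calls are illustrative and not part of this commit (loading from the Hub normally relies on auto_map entries in config.json together with trust_remote_code).

```python
# Illustrative only: register the custom config/model so the Auto* API can resolve them
# when loading a locally saved checkpoint (the directory path below is hypothetical).
from transformers import AutoConfig, AutoModel

from modeling_seamless_basic import HFSeamlessBasic, SeamlessBasicConfig

AutoConfig.register("seamless_basic", SeamlessBasicConfig)
AutoModel.register(SeamlessBasicConfig, HFSeamlessBasic)

# model = AutoModel.from_pretrained("./local-seamless-basic-checkpoint")
```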
example_usage.py CHANGED
@@ -8,6 +8,7 @@ import numpy as np
 import importlib.util
 
 def load_model_and_collator():
+    # Load model - architecture is included in the repository
     model = AutoModel.from_pretrained("videoloc/seamless-basic")
     config = AutoConfig.from_pretrained("videoloc/seamless-basic")
 
modeling_seamless_basic.py ADDED
@@ -0,0 +1,111 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import SequenceClassifierOutput
+from transformers import SeamlessM4TModel
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class SeamlessBasicConfig(PretrainedConfig):
+    """Configuration class for SeamlessBasic model."""
+
+    model_type = "seamless_basic"
+
+    def __init__(
+        self,
+        seamless_model_name="facebook/hf-seamless-m4t-medium",
+        hidden_size=1024,
+        dropout_prob=0.1,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.seamless_model_name = seamless_model_name
+        self.hidden_size = hidden_size
+        self.dropout_prob = dropout_prob
+
+
+class HFSeamlessBasic(PreTrainedModel):
+    """Basic SeamlessM4T model for HuggingFace Hub - processes audio and text only."""
+
+    config_class = SeamlessBasicConfig
+    supports_gradient_checkpointing = True
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        # Load the underlying SeamlessM4T model
+        self.seamless_model = SeamlessM4TModel.from_pretrained(config.seamless_model_name)
+        self.seamless_model_speech_encoder = self.seamless_model.speech_encoder
+        self.seamless_model_text_encoder = self.seamless_model.text_encoder
+
+        # Freeze the pre-trained encoders (only the projections and the head are trained)
+        for param in self.seamless_model_speech_encoder.parameters():
+            param.requires_grad = False
+        for param in self.seamless_model_text_encoder.parameters():
+            param.requires_grad = False
+
+        # Projection layers
+        self.audio_proj = nn.Linear(
+            self.seamless_model_speech_encoder.config.hidden_size,
+            config.hidden_size
+        )
+        self.text_proj = nn.Linear(
+            self.seamless_model_text_encoder.config.hidden_size,
+            config.hidden_size
+        )
+
+        # Regression head on the concatenated embeddings (2048 = 1024 audio + 1024 text)
+        self.fc = nn.Sequential(
+            nn.Linear(2048, 1024),
+            nn.ReLU(),
+            nn.Dropout(config.dropout_prob),
+            nn.Linear(1024, 512),
+            nn.ReLU(),
+            nn.Dropout(config.dropout_prob),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Dropout(config.dropout_prob),
+            nn.Linear(256, 1)
+        )
+
+    def forward(
+        self,
+        input_features,
+        input_ids,
+        text_attention_mask,
+        audio_attention_mask=None,
+        labels=None,
+        **kwargs  # Accept additional features but ignore them
+    ):
+        # Encode audio: mean-pool the speech encoder states over time
+        audio_emb = self.seamless_model_speech_encoder(
+            input_features=input_features,
+            attention_mask=audio_attention_mask
+        ).last_hidden_state.mean(dim=1)
+        audio_emb = self.audio_proj(audio_emb)
+
+        # Encode text: mean-pool the text encoder states over tokens
+        text_emb = self.seamless_model_text_encoder(
+            input_ids=input_ids,
+            attention_mask=text_attention_mask
+        ).last_hidden_state.mean(dim=1)
+        text_emb = self.text_proj(text_emb)
+
+        # Combine features
+        combined = torch.cat([audio_emb, text_emb], dim=1)  # (batch_size, 2048)
+
+        logits = self.fc(combined).squeeze(-1)
+
+        # Compute MSE loss if labels are provided (regression objective)
+        loss = F.mse_loss(logits, labels) if labels is not None else None
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=None,
+            attentions=None
+        )
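
For reference, a minimal smoke test of HFSeamlessBasic might look like the sketch below. It assumes modeling_seamless_basic.py is importable from the working directory and that the stock SeamlessM4T processor produces input_features and input_ids compatible with the frozen encoders; the dummy audio and text are placeholders, and instantiating the model downloads the full facebook/hf-seamless-m4t-medium checkpoint.

```python
# Hypothetical smoke test (not part of this commit); argument names follow forward() above.
import numpy as np
import torch
from transformers import AutoProcessor

from modeling_seamless_basic import HFSeamlessBasic, SeamlessBasicConfig

config = SeamlessBasicConfig()          # defaults to facebook/hf-seamless-m4t-medium
model = HFSeamlessBasic(config).eval()  # downloads and freezes the SeamlessM4T encoders

processor = AutoProcessor.from_pretrained(config.seamless_model_name)
audio = np.zeros(16000, dtype=np.float32)  # 1 second of silence at 16 kHz
audio_inputs = processor(audios=audio, sampling_rate=16000, return_tensors="pt")
text_inputs = processor(text="an example subtitle", src_lang="eng", return_tensors="pt")

with torch.no_grad():
    out = model(
        input_features=audio_inputs.input_features,
        input_ids=text_inputs.input_ids,
        text_attention_mask=text_inputs.attention_mask,
        audio_attention_mask=audio_inputs.get("attention_mask"),
    )

print(out.logits.shape)  # torch.Size([1]) - one score per audio/text pair
```

The head ends in a single linear unit and the loss is mean-squared error, so despite the SequenceClassifierOutput wrapper the model behaves as a regressor producing one score per audio/text pair.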