Abhaykoul committed
Commit dd12bab · verified · 1 parent: e1db6ad

Update configuration_helpingai.py

Files changed (1):
  1. configuration_helpingai.py (+302, −269)
configuration_helpingai.py CHANGED
@@ -1,269 +1,302 @@
+ """HelpingAI model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+ from transformers.modeling_rope_utils import rope_config_validation
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class HelpingAIConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`HelpingAIModel`]. It is used to instantiate a
+     HelpingAI model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of
+     HelpingAI-8B [HelpingAI/HelpingAI-8B](https://huggingface.co/HelpingAI/HelpingAI-8B).
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 151936):
+             Vocabulary size of the HelpingAI model. Defines the number of different tokens that can be represented by
+             the `input_ids` passed when calling [`HelpingAIModel`].
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 22016):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         num_key_value_heads (`int`, *optional*, defaults to 8):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention (GQA). If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details, check out [this
+             paper](https://huggingface.co/papers/2305.13245). If `None`, it defaults to `num_attention_heads`.
+         head_dim (`int`, *optional*, defaults to 128):
+             The attention head dimension.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 32768):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
+             and you expect the model to work on longer `max_position_embeddings`, we recommend you update this value
+             accordingly.
+             Expected contents:
+                 `rope_type` (`str`):
+                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                     'llama3'], with 'default' being the original RoPE implementation.
+                 `factor` (`float`, *optional*):
+                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                     original maximum pre-trained length.
+                 `original_max_position_embeddings` (`int`, *optional*):
+                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                     pretraining.
+                 `attention_factor` (`float`, *optional*):
+                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                     computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                     `factor` field to infer the suggested value.
+                 `beta_fast` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 32.
+                 `beta_slow` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 1.
+                 `short_factor` (`list[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2.
+                 `long_factor` (`list[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2.
+                 `low_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
+                 `high_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         use_sliding_window (`bool`, *optional*, defaults to `False`):
+             Whether to use sliding window attention.
+         sliding_window (`int`, *optional*, defaults to 4096):
+             Sliding window attention (SWA) window size. If not specified, will default to `4096`.
+         max_window_layers (`int`, *optional*, defaults to 28):
+             The number of layers using full attention. The first `max_window_layers` layers use full attention, while
+             any layer afterwards uses SWA (Sliding Window Attention).
+         layer_types (`list`, *optional*):
+             Attention pattern for each layer.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         use_emotional_reasoning (`bool`, *optional*, defaults to `False`):
+             Whether to enable Semantic Emotion Reasoning (SER) capabilities for emotional understanding and processing.
+         use_perspective_threading (`bool`, *optional*, defaults to `True`):
+             Whether to enable Perspective Emotion Threading (PET) for multi-threaded emotional reasoning.
+         num_emotion_heads (`int`, *optional*, defaults to 4):
+             Number of specialized attention heads dedicated to emotional processing and reasoning.
+         num_thinking_stages (`int`, *optional*, defaults to 3):
+             Number of thinking stages for multi-stage reasoning and reflection processing.
+         emotion_hidden_size (`int`, *optional*, defaults to 512):
+             Hidden size for the emotional reasoning layers and SER processing modules.
+         perspective_threads (`int`, *optional*, defaults to 4):
+             Number of parallel perspective threads for PET processing (relatable, supportive, motivational, analytical).
+         thinking_depth (`int`, *optional*, defaults to 2):
+             Depth of thinking layers for internal reasoning and reflection processes.
+         structured_output_vocab_size (`int`, *optional*, defaults to 100):
+             Additional vocabulary size for structured output tokens like `<think>`, `<ser>`, `<pet>`, etc.
+         empathy_scaling_factor (`float`, *optional*, defaults to 1.2):
+             Scaling factor for empathy-related attention weights and emotional processing.
+         reasoning_temperature (`float`, *optional*, defaults to 0.8):
+             Temperature parameter for reasoning and thinking processes to balance creativity and coherence.
+         use_speech_output (`bool`, *optional*, defaults to `False`):
+             Whether to enable an additional text-to-speech head that predicts mel-spectrogram frames from hidden states.
+         speech_num_mels (`int`, *optional*, defaults to `80`):
+             Number of mel bins to predict for the speech head.
+         speech_upsample_factor (`int`, *optional*, defaults to `1`):
+             Temporal upsampling factor used to expand token-level hidden states to frame-level resolution by simple
+             repetition.
+         speech_loss_type (`str`, *optional*, defaults to `"l1"`):
+             Loss used for speech supervision. One of `"l1"` or `"mse"`.
+         speech_head_hidden_dim (`int`, *optional*, defaults to `None`):
+             Hidden dimension of the speech head MLP (`hidden_size -> speech_head_hidden_dim -> speech_num_mels`).
+             If `None`, defaults to `hidden_size // 2`. Increase it to scale the speech head's parameter count
+             (e.g., ~9.6k for roughly 50M parameters).
+
+     ```python
+     >>> from transformers import HelpingAIModel, HelpingAIConfig
+
+     >>> # Initializing a HelpingAI style configuration with advanced reasoning
+     >>> configuration = HelpingAIConfig(
+     ...     use_emotional_reasoning=True,
+     ...     use_perspective_threading=True,
+     ...     num_emotion_heads=4,
+     ...     num_thinking_stages=3,
+     ... )
+
+     >>> # Initializing a model from the HelpingAI-8B style configuration
+     >>> model = HelpingAIModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "helpingai"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     # Default tensor parallel plan for base model `HelpingAI`
+     base_model_tp_plan = {
+         "layers.*.self_attn.q_proj": "colwise",
+         "layers.*.self_attn.k_proj": "colwise",
+         "layers.*.self_attn.v_proj": "colwise",
+         "layers.*.self_attn.o_proj": "rowwise",
+         "layers.*.mlp.gate_proj": "colwise",
+         "layers.*.mlp.up_proj": "colwise",
+         "layers.*.mlp.down_proj": "rowwise",
+     }
+     base_model_pp_plan = {
+         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+         "norm": (["hidden_states"], ["hidden_states"]),
+     }
+
+     def __init__(
+         self,
+         vocab_size=151936,
+         hidden_size=4096,
+         intermediate_size=22016,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=8,  # 8 KV heads -> grouped-query attention (GQA)
+         head_dim=128,
+         hidden_act="silu",
+         max_position_embeddings=32768,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         attention_bias=False,
+         use_sliding_window=False,
+         sliding_window=4096,
+         max_window_layers=28,
+         layer_types=None,
+         attention_dropout=0.0,
+         # Advanced reasoning parameters
+         use_emotional_reasoning=False,  # Disabled by default for now
+         use_perspective_threading=True,
+         num_emotion_heads=4,
+         num_thinking_stages=3,
+         emotion_hidden_size=512,
+         perspective_threads=4,
+         thinking_depth=2,
+         structured_output_vocab_size=100,
+         empathy_scaling_factor=1.2,
+         reasoning_temperature=0.8,
+         # Speech output head options
+         use_speech_output=False,
+         speech_num_mels=80,
+         speech_upsample_factor=1,
+         speech_loss_type="l1",
+         speech_head_hidden_dim=None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.use_sliding_window = use_sliding_window
+         self.sliding_window = sliding_window if self.use_sliding_window else None
+         self.max_window_layers = max_window_layers
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.head_dim = head_dim
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+
+         # Advanced reasoning capabilities
+         self.use_emotional_reasoning = use_emotional_reasoning
+         self.use_perspective_threading = use_perspective_threading
+         self.num_emotion_heads = num_emotion_heads
+         self.num_thinking_stages = num_thinking_stages
+         self.emotion_hidden_size = emotion_hidden_size
+         self.perspective_threads = perspective_threads
+         self.thinking_depth = thinking_depth
+         self.structured_output_vocab_size = structured_output_vocab_size
+         self.empathy_scaling_factor = empathy_scaling_factor
+         self.reasoning_temperature = reasoning_temperature
+
+         # Speech head config
+         self.use_speech_output = use_speech_output
+         self.speech_num_mels = speech_num_mels
+         self.speech_upsample_factor = speech_upsample_factor
+         self.speech_loss_type = speech_loss_type
+         self.speech_head_hidden_dim = speech_head_hidden_dim
+
+         # Validate emotional reasoning parameters
+         if self.use_emotional_reasoning and self.num_emotion_heads > self.num_attention_heads:
+             raise ValueError(f"num_emotion_heads ({self.num_emotion_heads}) cannot exceed num_attention_heads ({self.num_attention_heads})")
+
+         if self.use_perspective_threading and self.perspective_threads < 2:
+             raise ValueError(f"perspective_threads ({self.perspective_threads}) must be at least 2 for meaningful threading")
+
+         if self.use_speech_output:
+             if not isinstance(self.speech_num_mels, int) or self.speech_num_mels <= 0:
+                 raise ValueError("speech_num_mels must be a positive integer")
+             if not isinstance(self.speech_upsample_factor, int) or self.speech_upsample_factor <= 0:
+                 raise ValueError("speech_upsample_factor must be a positive integer")
+             if self.speech_loss_type not in {"l1", "mse"}:
+                 raise ValueError("speech_loss_type must be one of {'l1','mse'}")
+             if self.speech_head_hidden_dim is not None:
+                 if not isinstance(self.speech_head_hidden_dim, int) or self.speech_head_hidden_dim <= 0:
+                     raise ValueError("speech_head_hidden_dim must be a positive integer when provided")
+
+         # Validate the correctness of rotary position embeddings parameters
+         # BC: if there is a 'type' field, move it to 'rope_type'.
+         if self.rope_scaling is not None and "type" in self.rope_scaling:
+             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+         rope_config_validation(self)
+
+         self.layer_types = layer_types
+         if self.layer_types is None:
+             self.layer_types = [
+                 "sliding_attention"
+                 if self.sliding_window is not None and i >= self.max_window_layers
+                 else "full_attention"
+                 for i in range(self.num_hidden_layers)
+             ]
+         layer_type_validation(self.layer_types)
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+
+ __all__ = ["HelpingAIConfig"]
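A minimal usage sketch of the speech-head options this commit adds, not part of the commit itself. It assumes the file above is saved locally as `configuration_helpingai.py` alongside a recent `transformers` release that provides `layer_type_validation` and `rope_config_validation`; the speech head module and `HelpingAIModel` live in the companion modeling file, which is not shown here, so the `hidden_size // 2` fallback and the repetition-based upsampling are taken on faith from the docstring.

```python
# Hypothetical usage sketch -- not part of the commit.
# Assumes this config file is saved locally as configuration_helpingai.py.
from configuration_helpingai import HelpingAIConfig

config = HelpingAIConfig(
    use_speech_output=True,       # turn on the optional text-to-speech head
    speech_num_mels=80,           # mel bins predicted per output frame
    speech_upsample_factor=4,     # each token hidden state repeated 4x in time (per the docstring)
    speech_loss_type="l1",        # must be "l1" or "mse"; anything else raises ValueError
    speech_head_hidden_dim=None,  # None -> the modeling code is expected to fall back to hidden_size // 2
)

# The config only stores hyperparameters; a rough weight count for the documented
# hidden_size -> speech_head_hidden_dim -> speech_num_mels MLP (biases ignored):
hidden_dim = config.speech_head_hidden_dim or config.hidden_size // 2
approx_weights = config.hidden_size * hidden_dim + hidden_dim * config.speech_num_mels
print(hidden_dim, approx_weights)  # 2048, 8552448 (~8.6M) with the default hidden_size=4096

# Invalid speech settings are rejected eagerly in __init__:
try:
    HelpingAIConfig(use_speech_output=True, speech_loss_type="huber")
except ValueError as err:
    print(err)  # speech_loss_type must be one of {'l1','mse'}
```

Note that only the stored values and the `ValueError` checks are exercised here; how the modeling file consumes `speech_upsample_factor` and `speech_head_hidden_dim` is an assumption based on the parameter documentation above.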