SmerkyG committed
Commit 1161365 · verified · 1 parent: 35648d6

Update configuration_rwkv6qwen2.py

Files changed (1)
  1. configuration_rwkv6qwen2.py +16 -0
configuration_rwkv6qwen2.py CHANGED
@@ -137,12 +137,15 @@ class RWKV6Qwen2Config(PretrainedConfig):
         num_hidden_layers=32,
         num_attention_heads=32,
         num_key_value_heads=32,
+        lora_rank_tokenshift=None,
+        lora_rank_decay=None,
         hidden_act="silu",
         max_position_embeddings=32768,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
+        use_rope=False,
         rope_theta=10000.0,
         rope_scaling=None,
         use_sliding_window=False,
@@ -151,6 +154,11 @@ class RWKV6Qwen2Config(PretrainedConfig):
         attention_dropout=0.0,
         attention_bias=True,
         attention_output_bias=False,
+        gate_rank_type=1,
+        lora_rank_gate=None,
+        balance_state=True,
+        groupnorm_att=False,
+        use_tokenshift=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -168,10 +176,13 @@ class RWKV6Qwen2Config(PretrainedConfig):
             num_key_value_heads = num_attention_heads
 
         self.num_key_value_heads = num_key_value_heads
+        self.lora_rank_tokenshift = lora_rank_tokenshift
+        self.lora_rank_decay = lora_rank_decay
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
+        self.use_rope = use_rope
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         self.attention_dropout = attention_dropout
@@ -183,6 +194,11 @@ class RWKV6Qwen2Config(PretrainedConfig):
 
         self.attention_bias = attention_bias
         self.attention_output_bias = attention_output_bias
+        self.gate_rank_type = gate_rank_type
+        self.lora_rank_gate = lora_rank_gate
+        self.balance_state = balance_state
+        self.groupnorm_att = groupnorm_att
+        self.use_tokenshift = use_tokenshift
 
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
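
For context, the commit adds eight new constructor arguments and stores each one on the config. Below is a minimal sketch of setting them by hand, assuming configuration_rwkv6qwen2.py is importable locally; when the file ships as custom code on the Hub, loading would normally go through AutoConfig.from_pretrained(..., trust_remote_code=True) instead. Only the parameter names and default values are taken from the diff; the flat import and the printed attributes are illustrative, and the comments on what each knob controls are inferred from the names, not confirmed against the modeling code.

# Sketch only: builds the config with the fields added in this commit,
# using the default values shown in the diff above.
from configuration_rwkv6qwen2 import RWKV6Qwen2Config

config = RWKV6Qwen2Config(
    lora_rank_tokenshift=None,  # LoRA rank for token-shift (None presumably means a model-chosen default)
    lora_rank_decay=None,       # LoRA rank for the decay projection
    use_rope=False,             # whether to apply rotary position embeddings
    gate_rank_type=1,
    lora_rank_gate=None,        # LoRA rank for the gate projection
    balance_state=True,
    groupnorm_att=False,        # GroupNorm on the attention path
    use_tokenshift=True,        # enable RWKV-style token shift
)

# The new options are ordinary config attributes and round-trip through config.to_dict().
print(config.use_rope, config.use_tokenshift, config.lora_rank_gate)

The same values can also be passed as keyword overrides to AutoConfig.from_pretrained for a checkpoint that ships this configuration class, since from_pretrained forwards unused keyword arguments to the config constructor.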